You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
412 lines
12 KiB
412 lines
12 KiB
// Copyright 2013 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
package language |
|
|
|
import ( |
|
"bytes" |
|
"fmt" |
|
"sort" |
|
"strconv" |
|
|
|
"golang.org/x/text/internal/tag" |
|
) |
|
|
|
// findIndex tries to find the given tag in idx and returns a standardized error |
|
// if it could not be found. |
|
func findIndex(idx tag.Index, key []byte, form string) (index int, err error) { |
|
if !tag.FixCase(form, key) { |
|
return 0, ErrSyntax |
|
} |
|
i := idx.Index(key) |
|
if i == -1 { |
|
return 0, NewValueError(key) |
|
} |
|
return i, nil |
|
} |
|
|
|
// searchUint returns the smallest index i for which imap[i] >= key, or
// len(imap) if there is no such index. imap must be sorted in ascending order.
func searchUint(imap []uint16, key uint16) int {
	atLeastKey := func(pos int) bool {
		return key <= imap[pos]
	}
	return sort.Search(len(imap), atLeastKey)
}
|
|
|
// Language is a compact 16-bit identifier for a language subtag.
// The zero value is rendered as "und" by String.
type Language uint16
|
|
|
// getLangID returns the langID of s if s is a canonical subtag |
|
// or langUnknown if s is not a canonical subtag. |
|
func getLangID(s []byte) (Language, error) { |
|
if len(s) == 2 { |
|
return getLangISO2(s) |
|
} |
|
return getLangISO3(s) |
|
} |
|
|
|
// TODO language normalization as well as the AliasMaps could be moved to the |
|
// higher level package, but it is a bit tricky to separate the generation. |
|
|
|
func (id Language) Canonicalize() (Language, AliasType) { |
|
return normLang(id) |
|
} |
|
|
|
// mapLang returns the mapped langID of id according to mapping m. |
|
func normLang(id Language) (Language, AliasType) { |
|
k := sort.Search(len(AliasMap), func(i int) bool { |
|
return AliasMap[i].From >= uint16(id) |
|
}) |
|
if k < len(AliasMap) && AliasMap[k].From == uint16(id) { |
|
return Language(AliasMap[k].To), AliasTypes[k] |
|
} |
|
return id, AliasTypeUnknown |
|
} |
|
|
|
// getLangISO2 returns the langID for the given 2-letter ISO language code |
|
// or unknownLang if this does not exist. |
|
func getLangISO2(s []byte) (Language, error) { |
|
if !tag.FixCase("zz", s) { |
|
return 0, ErrSyntax |
|
} |
|
if i := lang.Index(s); i != -1 && lang.Elem(i)[3] != 0 { |
|
return Language(i), nil |
|
} |
|
return 0, NewValueError(s) |
|
} |
|
|
|
// base is the number of letters in the range 'a' through 'z'.
const base = 'z' - 'a' + 1

// strToInt interprets s as a number written in base 26 with the digits
// 'a' (0) through 'z' (25), most significant digit first. An empty s yields 0.
// The bytes of s are expected to be lowercase ASCII letters.
func strToInt(s []byte) uint {
	var v uint
	for _, c := range s {
		v = v*base + uint(c-'a')
	}
	return v
}
|
|
|
// converts the given integer to the original ASCII string passed to strToInt. |
|
// len(s) must match the number of characters obtained. |
|
func intToStr(v uint, s []byte) { |
|
for i := len(s) - 1; i >= 0; i-- { |
|
s[i] = byte(v%base) + 'a' |
|
v /= base |
|
} |
|
} |
|
|
|
// getLangISO3 returns the langID for the given 3-letter ISO language code |
|
// or unknownLang if this does not exist. |
|
func getLangISO3(s []byte) (Language, error) { |
|
if tag.FixCase("und", s) { |
|
// first try to match canonical 3-letter entries |
|
for i := lang.Index(s[:2]); i != -1; i = lang.Next(s[:2], i) { |
|
if e := lang.Elem(i); e[3] == 0 && e[2] == s[2] { |
|
// We treat "und" as special and always translate it to "unspecified". |
|
// Note that ZZ and Zzzz are private use and are not treated as |
|
// unspecified by default. |
|
id := Language(i) |
|
if id == nonCanonicalUnd { |
|
return 0, nil |
|
} |
|
return id, nil |
|
} |
|
} |
|
if i := altLangISO3.Index(s); i != -1 { |
|
return Language(altLangIndex[altLangISO3.Elem(i)[3]]), nil |
|
} |
|
n := strToInt(s) |
|
if langNoIndex[n/8]&(1<<(n%8)) != 0 { |
|
return Language(n) + langNoIndexOffset, nil |
|
} |
|
// Check for non-canonical uses of ISO3. |
|
for i := lang.Index(s[:1]); i != -1; i = lang.Next(s[:1], i) { |
|
if e := lang.Elem(i); e[2] == s[1] && e[3] == s[2] { |
|
return Language(i), nil |
|
} |
|
} |
|
return 0, NewValueError(s) |
|
} |
|
return 0, ErrSyntax |
|
} |
|
|
|
// StringToBuf writes the string to b and returns the number of bytes |
|
// written. cap(b) must be >= 3. |
|
func (id Language) StringToBuf(b []byte) int { |
|
if id >= langNoIndexOffset { |
|
intToStr(uint(id)-langNoIndexOffset, b[:3]) |
|
return 3 |
|
} else if id == 0 { |
|
return copy(b, "und") |
|
} |
|
l := lang[id<<2:] |
|
if l[3] == 0 { |
|
return copy(b, l[:3]) |
|
} |
|
return copy(b, l[:2]) |
|
} |
|
|
|
// String returns the BCP 47 representation of the langID. |
|
// Use b as variable name, instead of id, to ensure the variable |
|
// used is consistent with that of Base in which this type is embedded. |
|
func (b Language) String() string { |
|
if b == 0 { |
|
return "und" |
|
} else if b >= langNoIndexOffset { |
|
b -= langNoIndexOffset |
|
buf := [3]byte{} |
|
intToStr(uint(b), buf[:]) |
|
return string(buf[:]) |
|
} |
|
l := lang.Elem(int(b)) |
|
if l[3] == 0 { |
|
return l[:3] |
|
} |
|
return l[:2] |
|
} |
|
|
|
// ISO3 returns the ISO 639-3 language code. |
|
func (b Language) ISO3() string { |
|
if b == 0 || b >= langNoIndexOffset { |
|
return b.String() |
|
} |
|
l := lang.Elem(int(b)) |
|
if l[3] == 0 { |
|
return l[:3] |
|
} else if l[2] == 0 { |
|
return altLangISO3.Elem(int(l[3]))[:3] |
|
} |
|
// This allocation will only happen for 3-letter ISO codes |
|
// that are non-canonical BCP 47 language identifiers. |
|
return l[0:1] + l[2:4] |
|
} |
|
|
|
// IsPrivateUse reports whether this language code is reserved for private use. |
|
func (b Language) IsPrivateUse() bool { |
|
return langPrivateStart <= b && b <= langPrivateEnd |
|
} |
|
|
|
// SuppressScript returns the script marked as SuppressScript in the IANA |
|
// language tag repository, or 0 if there is no such script. |
|
func (b Language) SuppressScript() Script { |
|
if b < langNoIndexOffset { |
|
return Script(suppressScript[b]) |
|
} |
|
return 0 |
|
} |
|
|
|
// Region is a compact 16-bit identifier for a region.
// The zero value is rendered as "ZZ" by String.
type Region uint16
|
|
|
// getRegionID returns the region id for s if s is a valid 2-letter region code |
|
// or unknownRegion. |
|
func getRegionID(s []byte) (Region, error) { |
|
if len(s) == 3 { |
|
if isAlpha(s[0]) { |
|
return getRegionISO3(s) |
|
} |
|
if i, err := strconv.ParseUint(string(s), 10, 10); err == nil { |
|
return getRegionM49(int(i)) |
|
} |
|
} |
|
return getRegionISO2(s) |
|
} |
|
|
|
// getRegionISO2 returns the regionID for the given 2-letter ISO country code |
|
// or unknownRegion if this does not exist. |
|
func getRegionISO2(s []byte) (Region, error) { |
|
i, err := findIndex(regionISO, s, "ZZ") |
|
if err != nil { |
|
return 0, err |
|
} |
|
return Region(i) + isoRegionOffset, nil |
|
} |
|
|
|
// getRegionISO3 returns the regionID for the given 3-letter ISO country code |
|
// or unknownRegion if this does not exist. |
|
func getRegionISO3(s []byte) (Region, error) { |
|
if tag.FixCase("ZZZ", s) { |
|
for i := regionISO.Index(s[:1]); i != -1; i = regionISO.Next(s[:1], i) { |
|
if e := regionISO.Elem(i); e[2] == s[1] && e[3] == s[2] { |
|
return Region(i) + isoRegionOffset, nil |
|
} |
|
} |
|
for i := 0; i < len(altRegionISO3); i += 3 { |
|
if tag.Compare(altRegionISO3[i:i+3], s) == 0 { |
|
return Region(altRegionIDs[i/3]), nil |
|
} |
|
} |
|
return 0, NewValueError(s) |
|
} |
|
return 0, ErrSyntax |
|
} |
|
|
|
func getRegionM49(n int) (Region, error) { |
|
if 0 < n && n <= 999 { |
|
const ( |
|
searchBits = 7 |
|
regionBits = 9 |
|
regionMask = 1<<regionBits - 1 |
|
) |
|
idx := n >> searchBits |
|
buf := fromM49[m49Index[idx]:m49Index[idx+1]] |
|
val := uint16(n) << regionBits // we rely on bits shifting out |
|
i := sort.Search(len(buf), func(i int) bool { |
|
return buf[i] >= val |
|
}) |
|
if r := fromM49[int(m49Index[idx])+i]; r&^regionMask == val { |
|
return Region(r & regionMask), nil |
|
} |
|
} |
|
var e ValueError |
|
fmt.Fprint(bytes.NewBuffer([]byte(e.v[:])), n) |
|
return 0, e |
|
} |
|
|
|
// normRegion returns a region if r is deprecated or 0 otherwise. |
|
// TODO: consider supporting BYS (-> BLR), CSK (-> 200 or CZ), PHI (-> PHL) and AFI (-> DJ). |
|
// TODO: consider mapping split up regions to new most populous one (like CLDR). |
|
func normRegion(r Region) Region { |
|
m := regionOldMap |
|
k := sort.Search(len(m), func(i int) bool { |
|
return m[i].From >= uint16(r) |
|
}) |
|
if k < len(m) && m[k].From == uint16(r) { |
|
return Region(m[k].To) |
|
} |
|
return 0 |
|
} |
|
|
|
// Bit flags for the per-region type byte returned by Region.typ.
const (
	// iso3166UserAssigned is the flag tested by Region.IsPrivateUse.
	iso3166UserAssigned = 1 << iota
	// NOTE(review): presumably marks regions that have a country-code
	// top-level domain — confirm against the table generator.
	ccTLD
	// NOTE(review): presumably marks regions that are valid BCP 47 region
	// subtags — confirm against the table generator.
	bcp47Region
)
|
|
|
func (r Region) typ() byte { |
|
return regionTypes[r] |
|
} |
|
|
|
// String returns the BCP 47 representation for the region. |
|
// It returns "ZZ" for an unspecified region. |
|
func (r Region) String() string { |
|
if r < isoRegionOffset { |
|
if r == 0 { |
|
return "ZZ" |
|
} |
|
return fmt.Sprintf("%03d", r.M49()) |
|
} |
|
r -= isoRegionOffset |
|
return regionISO.Elem(int(r))[:2] |
|
} |
|
|
|
// ISO3 returns the 3-letter ISO code of r. |
|
// Note that not all regions have a 3-letter ISO code. |
|
// In such cases this method returns "ZZZ". |
|
func (r Region) ISO3() string { |
|
if r < isoRegionOffset { |
|
return "ZZZ" |
|
} |
|
r -= isoRegionOffset |
|
reg := regionISO.Elem(int(r)) |
|
switch reg[2] { |
|
case 0: |
|
return altRegionISO3[reg[3]:][:3] |
|
case ' ': |
|
return "ZZZ" |
|
} |
|
return reg[0:1] + reg[2:4] |
|
} |
|
|
|
// M49 returns the UN M.49 encoding of r, or 0 if this encoding |
|
// is not defined for r. |
|
func (r Region) M49() int { |
|
return int(m49[r]) |
|
} |
|
|
|
// IsPrivateUse reports whether r has the ISO 3166 User-assigned status. This |
|
// may include private-use tags that are assigned by CLDR and used in this |
|
// implementation. So IsPrivateUse and IsCountry can be simultaneously true. |
|
func (r Region) IsPrivateUse() bool { |
|
return r.typ()&iso3166UserAssigned != 0 |
|
} |
|
|
|
// Script is a compact 8-bit identifier for a script.
// The zero value is rendered as "Zzzz" by String.
type Script uint8
|
|
|
// getScriptID returns the script id for string s. It assumes that s |
|
// is of the format [A-Z][a-z]{3}. |
|
func getScriptID(idx tag.Index, s []byte) (Script, error) { |
|
i, err := findIndex(idx, s, "Zzzz") |
|
return Script(i), err |
|
} |
|
|
|
// String returns the script code in title case. |
|
// It returns "Zzzz" for an unspecified script. |
|
func (s Script) String() string { |
|
if s == 0 { |
|
return "Zzzz" |
|
} |
|
return script.Elem(int(s)) |
|
} |
|
|
|
// IsPrivateUse reports whether this script code is reserved for private use. |
|
func (s Script) IsPrivateUse() bool { |
|
return _Qaaa <= s && s <= _Qabx |
|
} |
|
|
|
// maxAltTaglen is the byte length of "en-US-POSIX" and the array size of the
// argument to grandfathered. maxLen is the same value and is the array size of
// the keys of grandfatheredMap.
const (
	maxAltTaglen = len("en-US-POSIX")
	maxLen       = maxAltTaglen
)
|
|
|
var ( |
|
// grandfatheredMap holds a mapping from legacy and grandfathered tags to |
|
// their base language or index to more elaborate tag. |
|
grandfatheredMap = map[[maxLen]byte]int16{ |
|
[maxLen]byte{'a', 'r', 't', '-', 'l', 'o', 'j', 'b', 'a', 'n'}: _jbo, // art-lojban |
|
[maxLen]byte{'i', '-', 'a', 'm', 'i'}: _ami, // i-ami |
|
[maxLen]byte{'i', '-', 'b', 'n', 'n'}: _bnn, // i-bnn |
|
[maxLen]byte{'i', '-', 'h', 'a', 'k'}: _hak, // i-hak |
|
[maxLen]byte{'i', '-', 'k', 'l', 'i', 'n', 'g', 'o', 'n'}: _tlh, // i-klingon |
|
[maxLen]byte{'i', '-', 'l', 'u', 'x'}: _lb, // i-lux |
|
[maxLen]byte{'i', '-', 'n', 'a', 'v', 'a', 'j', 'o'}: _nv, // i-navajo |
|
[maxLen]byte{'i', '-', 'p', 'w', 'n'}: _pwn, // i-pwn |
|
[maxLen]byte{'i', '-', 't', 'a', 'o'}: _tao, // i-tao |
|
[maxLen]byte{'i', '-', 't', 'a', 'y'}: _tay, // i-tay |
|
[maxLen]byte{'i', '-', 't', 's', 'u'}: _tsu, // i-tsu |
|
[maxLen]byte{'n', 'o', '-', 'b', 'o', 'k'}: _nb, // no-bok |
|
[maxLen]byte{'n', 'o', '-', 'n', 'y', 'n'}: _nn, // no-nyn |
|
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'f', 'r'}: _sfb, // sgn-BE-FR |
|
[maxLen]byte{'s', 'g', 'n', '-', 'b', 'e', '-', 'n', 'l'}: _vgt, // sgn-BE-NL |
|
[maxLen]byte{'s', 'g', 'n', '-', 'c', 'h', '-', 'd', 'e'}: _sgg, // sgn-CH-DE |
|
[maxLen]byte{'z', 'h', '-', 'g', 'u', 'o', 'y', 'u'}: _cmn, // zh-guoyu |
|
[maxLen]byte{'z', 'h', '-', 'h', 'a', 'k', 'k', 'a'}: _hak, // zh-hakka |
|
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n', '-', 'n', 'a', 'n'}: _nan, // zh-min-nan |
|
[maxLen]byte{'z', 'h', '-', 'x', 'i', 'a', 'n', 'g'}: _hsn, // zh-xiang |
|
|
|
// Grandfathered tags with no modern replacement will be converted as |
|
// follows: |
|
[maxLen]byte{'c', 'e', 'l', '-', 'g', 'a', 'u', 'l', 'i', 's', 'h'}: -1, // cel-gaulish |
|
[maxLen]byte{'e', 'n', '-', 'g', 'b', '-', 'o', 'e', 'd'}: -2, // en-GB-oed |
|
[maxLen]byte{'i', '-', 'd', 'e', 'f', 'a', 'u', 'l', 't'}: -3, // i-default |
|
[maxLen]byte{'i', '-', 'e', 'n', 'o', 'c', 'h', 'i', 'a', 'n'}: -4, // i-enochian |
|
[maxLen]byte{'i', '-', 'm', 'i', 'n', 'g', 'o'}: -5, // i-mingo |
|
[maxLen]byte{'z', 'h', '-', 'm', 'i', 'n'}: -6, // zh-min |
|
|
|
// CLDR-specific tag. |
|
[maxLen]byte{'r', 'o', 'o', 't'}: 0, // root |
|
[maxLen]byte{'e', 'n', '-', 'u', 's', '-', 'p', 'o', 's', 'i', 'x'}: -7, // en_US_POSIX" |
|
} |
|
|
|
altTagIndex = [...]uint8{0, 17, 31, 45, 61, 74, 86, 102} |
|
|
|
altTags = "xtg-x-cel-gaulishen-GB-oxendicten-x-i-defaultund-x-i-enochiansee-x-i-mingonan-x-zh-minen-US-u-va-posix" |
|
) |
|
|
|
func grandfathered(s [maxAltTaglen]byte) (t Tag, ok bool) { |
|
if v, ok := grandfatheredMap[s]; ok { |
|
if v < 0 { |
|
return Make(altTags[altTagIndex[-v-1]:altTagIndex[-v]]), true |
|
} |
|
t.LangID = Language(v) |
|
return t, true |
|
} |
|
return t, false |
|
}
|
|
|