You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
816 lines
23 KiB
816 lines
23 KiB
// Copyright 2014 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
package cases |
|
|
|
// This file contains the definitions of case mappings for all supported |
|
// languages. The rules for the language-specific tailorings were taken and |
|
// modified from the CLDR transform definitions in common/transforms. |
|
|
|
import ( |
|
"strings" |
|
"unicode" |
|
"unicode/utf8" |
|
|
|
"golang.org/x/text/internal" |
|
"golang.org/x/text/language" |
|
"golang.org/x/text/transform" |
|
"golang.org/x/text/unicode/norm" |
|
) |
|
|
|
// A mapFunc takes a context set to the current rune and writes the mapped |
|
// version to the same context. It may advance the context to the next rune. It |
|
// returns whether a checkpoint is possible: whether the pDst bytes written to |
|
// dst so far won't need changing as we see more source bytes. |
|
type mapFunc func(*context) bool |
|
|
|
// A spanFunc takes a context set to the current rune and returns whether this |
|
// rune would be altered when written to the output. It may advance the context |
|
// to the next rune. It returns whether a checkpoint is possible. |
|
type spanFunc func(*context) bool |
|
|
|
// maxIgnorable defines the maximum number of ignorables to consider for |
|
// lookahead operations. |
|
const maxIgnorable = 30 |
|
|
|
// supported lists the language tags for which we have tailorings. |
|
const supported = "und af az el lt nl tr" |
|
|
|
func init() { |
|
tags := []language.Tag{} |
|
for _, s := range strings.Split(supported, " ") { |
|
tags = append(tags, language.MustParse(s)) |
|
} |
|
matcher = internal.NewInheritanceMatcher(tags) |
|
Supported = language.NewCoverage(tags) |
|
} |
|
|
|
var ( |
|
matcher *internal.InheritanceMatcher |
|
|
|
Supported language.Coverage |
|
|
|
// We keep the following lists separate, instead of having a single per- |
|
// language struct, to give the compiler a chance to remove unused code. |
|
|
|
// Some uppercase mappers are stateless, so we can precompute the |
|
// Transformers and save a bit on runtime allocations. |
|
upperFunc = []struct { |
|
upper mapFunc |
|
span spanFunc |
|
}{ |
|
{nil, nil}, // und |
|
{nil, nil}, // af |
|
{aztrUpper(upper), isUpper}, // az |
|
{elUpper, noSpan}, // el |
|
{ltUpper(upper), noSpan}, // lt |
|
{nil, nil}, // nl |
|
{aztrUpper(upper), isUpper}, // tr |
|
} |
|
|
|
undUpper transform.SpanningTransformer = &undUpperCaser{} |
|
undLower transform.SpanningTransformer = &undLowerCaser{} |
|
undLowerIgnoreSigma transform.SpanningTransformer = &undLowerIgnoreSigmaCaser{} |
|
|
|
lowerFunc = []mapFunc{ |
|
nil, // und |
|
nil, // af |
|
aztrLower, // az |
|
nil, // el |
|
ltLower, // lt |
|
nil, // nl |
|
aztrLower, // tr |
|
} |
|
|
|
titleInfos = []struct { |
|
title mapFunc |
|
lower mapFunc |
|
titleSpan spanFunc |
|
rewrite func(*context) |
|
}{ |
|
{title, lower, isTitle, nil}, // und |
|
{title, lower, isTitle, afnlRewrite}, // af |
|
{aztrUpper(title), aztrLower, isTitle, nil}, // az |
|
{title, lower, isTitle, nil}, // el |
|
{ltUpper(title), ltLower, noSpan, nil}, // lt |
|
{nlTitle, lower, nlTitleSpan, afnlRewrite}, // nl |
|
{aztrUpper(title), aztrLower, isTitle, nil}, // tr |
|
} |
|
) |
|
|
|
func makeUpper(t language.Tag, o options) transform.SpanningTransformer { |
|
_, i, _ := matcher.Match(t) |
|
f := upperFunc[i].upper |
|
if f == nil { |
|
return undUpper |
|
} |
|
return &simpleCaser{f: f, span: upperFunc[i].span} |
|
} |
|
|
|
func makeLower(t language.Tag, o options) transform.SpanningTransformer { |
|
_, i, _ := matcher.Match(t) |
|
f := lowerFunc[i] |
|
if f == nil { |
|
if o.ignoreFinalSigma { |
|
return undLowerIgnoreSigma |
|
} |
|
return undLower |
|
} |
|
if o.ignoreFinalSigma { |
|
return &simpleCaser{f: f, span: isLower} |
|
} |
|
return &lowerCaser{ |
|
first: f, |
|
midWord: finalSigma(f), |
|
} |
|
} |
|
|
|
func makeTitle(t language.Tag, o options) transform.SpanningTransformer { |
|
_, i, _ := matcher.Match(t) |
|
x := &titleInfos[i] |
|
lower := x.lower |
|
if o.noLower { |
|
lower = (*context).copy |
|
} else if !o.ignoreFinalSigma { |
|
lower = finalSigma(lower) |
|
} |
|
return &titleCaser{ |
|
title: x.title, |
|
lower: lower, |
|
titleSpan: x.titleSpan, |
|
rewrite: x.rewrite, |
|
} |
|
} |
|
|
|
func noSpan(c *context) bool { |
|
c.err = transform.ErrEndOfSpan |
|
return false |
|
} |
|
|
|
// TODO: consider a similar special case for the fast majority lower case. This |
|
// is a bit more involved so will require some more precise benchmarking to |
|
// justify it. |
|
|
|
type undUpperCaser struct{ transform.NopResetter } |
|
|
|
// undUpperCaser implements the Transformer interface for doing an upper case |
|
// mapping for the root locale (und). It eliminates the need for an allocation |
|
// as it prevents escaping by not using function pointers. |
|
func (t undUpperCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
c := context{dst: dst, src: src, atEOF: atEOF} |
|
for c.next() { |
|
upper(&c) |
|
c.checkpoint() |
|
} |
|
return c.ret() |
|
} |
|
|
|
func (t undUpperCaser) Span(src []byte, atEOF bool) (n int, err error) { |
|
c := context{src: src, atEOF: atEOF} |
|
for c.next() && isUpper(&c) { |
|
c.checkpoint() |
|
} |
|
return c.retSpan() |
|
} |
|
|
|
// undLowerIgnoreSigmaCaser implements the Transformer interface for doing |
|
// a lower case mapping for the root locale (und) ignoring final sigma |
|
// handling. This casing algorithm is used in some performance-critical packages |
|
// like secure/precis and x/net/http/idna, which warrants its special-casing. |
|
type undLowerIgnoreSigmaCaser struct{ transform.NopResetter } |
|
|
|
func (t undLowerIgnoreSigmaCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
c := context{dst: dst, src: src, atEOF: atEOF} |
|
for c.next() && lower(&c) { |
|
c.checkpoint() |
|
} |
|
return c.ret() |
|
|
|
} |
|
|
|
// Span implements a generic lower-casing. This is possible as isLower works |
|
// for all lowercasing variants. All lowercase variants only vary in how they |
|
// transform a non-lowercase letter. They will never change an already lowercase |
|
// letter. In addition, there is no state. |
|
func (t undLowerIgnoreSigmaCaser) Span(src []byte, atEOF bool) (n int, err error) { |
|
c := context{src: src, atEOF: atEOF} |
|
for c.next() && isLower(&c) { |
|
c.checkpoint() |
|
} |
|
return c.retSpan() |
|
} |
|
|
|
type simpleCaser struct { |
|
context |
|
f mapFunc |
|
span spanFunc |
|
} |
|
|
|
// simpleCaser implements the Transformer interface for doing a case operation |
|
// on a rune-by-rune basis. |
|
func (t *simpleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
c := context{dst: dst, src: src, atEOF: atEOF} |
|
for c.next() && t.f(&c) { |
|
c.checkpoint() |
|
} |
|
return c.ret() |
|
} |
|
|
|
func (t *simpleCaser) Span(src []byte, atEOF bool) (n int, err error) { |
|
c := context{src: src, atEOF: atEOF} |
|
for c.next() && t.span(&c) { |
|
c.checkpoint() |
|
} |
|
return c.retSpan() |
|
} |
|
|
|
// undLowerCaser implements the Transformer interface for doing a lower case |
|
// mapping for the root locale (und) ignoring final sigma handling. This casing |
|
// algorithm is used in some performance-critical packages like secure/precis |
|
// and x/net/http/idna, which warrants its special-casing. |
|
type undLowerCaser struct{ transform.NopResetter } |
|
|
|
func (t undLowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
c := context{dst: dst, src: src, atEOF: atEOF} |
|
|
|
for isInterWord := true; c.next(); { |
|
if isInterWord { |
|
if c.info.isCased() { |
|
if !lower(&c) { |
|
break |
|
} |
|
isInterWord = false |
|
} else if !c.copy() { |
|
break |
|
} |
|
} else { |
|
if c.info.isNotCasedAndNotCaseIgnorable() { |
|
if !c.copy() { |
|
break |
|
} |
|
isInterWord = true |
|
} else if !c.hasPrefix("Σ") { |
|
if !lower(&c) { |
|
break |
|
} |
|
} else if !finalSigmaBody(&c) { |
|
break |
|
} |
|
} |
|
c.checkpoint() |
|
} |
|
return c.ret() |
|
} |
|
|
|
func (t undLowerCaser) Span(src []byte, atEOF bool) (n int, err error) { |
|
c := context{src: src, atEOF: atEOF} |
|
for c.next() && isLower(&c) { |
|
c.checkpoint() |
|
} |
|
return c.retSpan() |
|
} |
|
|
|
// lowerCaser implements the Transformer interface. The default Unicode lower |
|
// casing requires different treatment for the first and subsequent characters |
|
// of a word, most notably to handle the Greek final Sigma. |
|
type lowerCaser struct { |
|
undLowerIgnoreSigmaCaser |
|
|
|
context |
|
|
|
first, midWord mapFunc |
|
} |
|
|
|
func (t *lowerCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
t.context = context{dst: dst, src: src, atEOF: atEOF} |
|
c := &t.context |
|
|
|
for isInterWord := true; c.next(); { |
|
if isInterWord { |
|
if c.info.isCased() { |
|
if !t.first(c) { |
|
break |
|
} |
|
isInterWord = false |
|
} else if !c.copy() { |
|
break |
|
} |
|
} else { |
|
if c.info.isNotCasedAndNotCaseIgnorable() { |
|
if !c.copy() { |
|
break |
|
} |
|
isInterWord = true |
|
} else if !t.midWord(c) { |
|
break |
|
} |
|
} |
|
c.checkpoint() |
|
} |
|
return c.ret() |
|
} |
|
|
|
// titleCaser implements the Transformer interface. Title casing algorithms |
|
// distinguish between the first letter of a word and subsequent letters of the |
|
// same word. It uses state to avoid requiring a potentially infinite lookahead. |
|
type titleCaser struct { |
|
context |
|
|
|
// rune mappings used by the actual casing algorithms. |
|
title mapFunc |
|
lower mapFunc |
|
titleSpan spanFunc |
|
|
|
rewrite func(*context) |
|
} |
|
|
|
// Transform implements the standard Unicode title case algorithm as defined in |
|
// Chapter 3 of The Unicode Standard: |
|
// toTitlecase(X): Find the word boundaries in X according to Unicode Standard |
|
// Annex #29, "Unicode Text Segmentation." For each word boundary, find the |
|
// first cased character F following the word boundary. If F exists, map F to |
|
// Titlecase_Mapping(F); then map all characters C between F and the following |
|
// word boundary to Lowercase_Mapping(C). |
|
func (t *titleCaser) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
t.context = context{dst: dst, src: src, atEOF: atEOF, isMidWord: t.isMidWord} |
|
c := &t.context |
|
|
|
if !c.next() { |
|
return c.ret() |
|
} |
|
|
|
for { |
|
p := c.info |
|
if t.rewrite != nil { |
|
t.rewrite(c) |
|
} |
|
|
|
wasMid := p.isMid() |
|
// Break out of this loop on failure to ensure we do not modify the |
|
// state incorrectly. |
|
if p.isCased() { |
|
if !c.isMidWord { |
|
if !t.title(c) { |
|
break |
|
} |
|
c.isMidWord = true |
|
} else if !t.lower(c) { |
|
break |
|
} |
|
} else if !c.copy() { |
|
break |
|
} else if p.isBreak() { |
|
c.isMidWord = false |
|
} |
|
|
|
// As we save the state of the transformer, it is safe to call |
|
// checkpoint after any successful write. |
|
if !(c.isMidWord && wasMid) { |
|
c.checkpoint() |
|
} |
|
|
|
if !c.next() { |
|
break |
|
} |
|
if wasMid && c.info.isMid() { |
|
c.isMidWord = false |
|
} |
|
} |
|
return c.ret() |
|
} |
|
|
|
func (t *titleCaser) Span(src []byte, atEOF bool) (n int, err error) { |
|
t.context = context{src: src, atEOF: atEOF, isMidWord: t.isMidWord} |
|
c := &t.context |
|
|
|
if !c.next() { |
|
return c.retSpan() |
|
} |
|
|
|
for { |
|
p := c.info |
|
if t.rewrite != nil { |
|
t.rewrite(c) |
|
} |
|
|
|
wasMid := p.isMid() |
|
// Break out of this loop on failure to ensure we do not modify the |
|
// state incorrectly. |
|
if p.isCased() { |
|
if !c.isMidWord { |
|
if !t.titleSpan(c) { |
|
break |
|
} |
|
c.isMidWord = true |
|
} else if !isLower(c) { |
|
break |
|
} |
|
} else if p.isBreak() { |
|
c.isMidWord = false |
|
} |
|
// As we save the state of the transformer, it is safe to call |
|
// checkpoint after any successful write. |
|
if !(c.isMidWord && wasMid) { |
|
c.checkpoint() |
|
} |
|
|
|
if !c.next() { |
|
break |
|
} |
|
if wasMid && c.info.isMid() { |
|
c.isMidWord = false |
|
} |
|
} |
|
return c.retSpan() |
|
} |
|
|
|
// finalSigma adds Greek final Sigma handing to another casing function. It |
|
// determines whether a lowercased sigma should be σ or ς, by looking ahead for |
|
// case-ignorables and a cased letters. |
|
func finalSigma(f mapFunc) mapFunc { |
|
return func(c *context) bool { |
|
if !c.hasPrefix("Σ") { |
|
return f(c) |
|
} |
|
return finalSigmaBody(c) |
|
} |
|
} |
|
|
|
func finalSigmaBody(c *context) bool { |
|
// Current rune must be ∑. |
|
|
|
// ::NFD(); |
|
// # 03A3; 03C2; 03A3; 03A3; Final_Sigma; # GREEK CAPITAL LETTER SIGMA |
|
// Σ } [:case-ignorable:]* [:cased:] → σ; |
|
// [:cased:] [:case-ignorable:]* { Σ → ς; |
|
// ::Any-Lower; |
|
// ::NFC(); |
|
|
|
p := c.pDst |
|
c.writeString("ς") |
|
|
|
// TODO: we should do this here, but right now this will never have an |
|
// effect as this is called when the prefix is Sigma, whereas Dutch and |
|
// Afrikaans only test for an apostrophe. |
|
// |
|
// if t.rewrite != nil { |
|
// t.rewrite(c) |
|
// } |
|
|
|
// We need to do one more iteration after maxIgnorable, as a cased |
|
// letter is not an ignorable and may modify the result. |
|
wasMid := false |
|
for i := 0; i < maxIgnorable+1; i++ { |
|
if !c.next() { |
|
return false |
|
} |
|
if !c.info.isCaseIgnorable() { |
|
// All Midword runes are also case ignorable, so we are |
|
// guaranteed to have a letter or word break here. As we are |
|
// unreading the run, there is no need to unset c.isMidWord; |
|
// the title caser will handle this. |
|
if c.info.isCased() { |
|
// p+1 is guaranteed to be in bounds: if writing ς was |
|
// successful, p+1 will contain the second byte of ς. If not, |
|
// this function will have returned after c.next returned false. |
|
c.dst[p+1]++ // ς → σ |
|
} |
|
c.unreadRune() |
|
return true |
|
} |
|
// A case ignorable may also introduce a word break, so we may need |
|
// to continue searching even after detecting a break. |
|
isMid := c.info.isMid() |
|
if (wasMid && isMid) || c.info.isBreak() { |
|
c.isMidWord = false |
|
} |
|
wasMid = isMid |
|
c.copy() |
|
} |
|
return true |
|
} |
|
|
|
// finalSigmaSpan would be the same as isLower. |
|
|
|
// elUpper implements Greek upper casing, which entails removing a predefined |
|
// set of non-blocked modifiers. Note that these accents should not be removed |
|
// for title casing! |
|
// Example: "Οδός" -> "ΟΔΟΣ". |
|
func elUpper(c *context) bool { |
|
// From CLDR: |
|
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Above:]]*? { [\u0313\u0314\u0301\u0300\u0306\u0342\u0308\u0304] → ; |
|
// [:Greek:] [^[:ccc=Not_Reordered:][:ccc=Iota_Subscript:]]*? { \u0345 → ; |
|
|
|
r, _ := utf8.DecodeRune(c.src[c.pSrc:]) |
|
oldPDst := c.pDst |
|
if !upper(c) { |
|
return false |
|
} |
|
if !unicode.Is(unicode.Greek, r) { |
|
return true |
|
} |
|
i := 0 |
|
// Take the properties of the uppercased rune that is already written to the |
|
// destination. This saves us the trouble of having to uppercase the |
|
// decomposed rune again. |
|
if b := norm.NFD.Properties(c.dst[oldPDst:]).Decomposition(); b != nil { |
|
// Restore the destination position and process the decomposed rune. |
|
r, sz := utf8.DecodeRune(b) |
|
if r <= 0xFF { // See A.6.1 |
|
return true |
|
} |
|
c.pDst = oldPDst |
|
// Insert the first rune and ignore the modifiers. See A.6.2. |
|
c.writeBytes(b[:sz]) |
|
i = len(b[sz:]) / 2 // Greek modifiers are always of length 2. |
|
} |
|
|
|
for ; i < maxIgnorable && c.next(); i++ { |
|
switch r, _ := utf8.DecodeRune(c.src[c.pSrc:]); r { |
|
// Above and Iota Subscript |
|
case 0x0300, // U+0300 COMBINING GRAVE ACCENT |
|
0x0301, // U+0301 COMBINING ACUTE ACCENT |
|
0x0304, // U+0304 COMBINING MACRON |
|
0x0306, // U+0306 COMBINING BREVE |
|
0x0308, // U+0308 COMBINING DIAERESIS |
|
0x0313, // U+0313 COMBINING COMMA ABOVE |
|
0x0314, // U+0314 COMBINING REVERSED COMMA ABOVE |
|
0x0342, // U+0342 COMBINING GREEK PERISPOMENI |
|
0x0345: // U+0345 COMBINING GREEK YPOGEGRAMMENI |
|
// No-op. Gobble the modifier. |
|
|
|
default: |
|
switch v, _ := trie.lookup(c.src[c.pSrc:]); info(v).cccType() { |
|
case cccZero: |
|
c.unreadRune() |
|
return true |
|
|
|
// We don't need to test for IotaSubscript as the only rune that |
|
// qualifies (U+0345) was already excluded in the switch statement |
|
// above. See A.4. |
|
|
|
case cccAbove: |
|
return c.copy() |
|
default: |
|
// Some other modifier. We're still allowed to gobble Greek |
|
// modifiers after this. |
|
c.copy() |
|
} |
|
} |
|
} |
|
return i == maxIgnorable |
|
} |
|
|
|
// TODO: implement elUpperSpan (low-priority: complex and infrequent). |
|
|
|
func ltLower(c *context) bool { |
|
// From CLDR: |
|
// # Introduce an explicit dot above when lowercasing capital I's and J's |
|
// # whenever there are more accents above. |
|
// # (of the accents used in Lithuanian: grave, acute, tilde above, and ogonek) |
|
// # 0049; 0069 0307; 0049; 0049; lt More_Above; # LATIN CAPITAL LETTER I |
|
// # 004A; 006A 0307; 004A; 004A; lt More_Above; # LATIN CAPITAL LETTER J |
|
// # 012E; 012F 0307; 012E; 012E; lt More_Above; # LATIN CAPITAL LETTER I WITH OGONEK |
|
// # 00CC; 0069 0307 0300; 00CC; 00CC; lt; # LATIN CAPITAL LETTER I WITH GRAVE |
|
// # 00CD; 0069 0307 0301; 00CD; 00CD; lt; # LATIN CAPITAL LETTER I WITH ACUTE |
|
// # 0128; 0069 0307 0303; 0128; 0128; lt; # LATIN CAPITAL LETTER I WITH TILDE |
|
// ::NFD(); |
|
// I } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0307; |
|
// J } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → j \u0307; |
|
// I \u0328 (Į) } [^[:ccc=Not_Reordered:][:ccc=Above:]]* [:ccc=Above:] → i \u0328 \u0307; |
|
// I \u0300 (Ì) → i \u0307 \u0300; |
|
// I \u0301 (Í) → i \u0307 \u0301; |
|
// I \u0303 (Ĩ) → i \u0307 \u0303; |
|
// ::Any-Lower(); |
|
// ::NFC(); |
|
|
|
i := 0 |
|
if r := c.src[c.pSrc]; r < utf8.RuneSelf { |
|
lower(c) |
|
if r != 'I' && r != 'J' { |
|
return true |
|
} |
|
} else { |
|
p := norm.NFD.Properties(c.src[c.pSrc:]) |
|
if d := p.Decomposition(); len(d) >= 3 && (d[0] == 'I' || d[0] == 'J') { |
|
// UTF-8 optimization: the decomposition will only have an above |
|
// modifier if the last rune of the decomposition is in [U+300-U+311]. |
|
// In all other cases, a decomposition starting with I is always |
|
// an I followed by modifiers that are not cased themselves. See A.2. |
|
if d[1] == 0xCC && d[2] <= 0x91 { // A.2.4. |
|
if !c.writeBytes(d[:1]) { |
|
return false |
|
} |
|
c.dst[c.pDst-1] += 'a' - 'A' // lower |
|
|
|
// Assumption: modifier never changes on lowercase. See A.1. |
|
// Assumption: all modifiers added have CCC = Above. See A.2.3. |
|
return c.writeString("\u0307") && c.writeBytes(d[1:]) |
|
} |
|
// In all other cases the additional modifiers will have a CCC |
|
// that is less than 230 (Above). We will insert the U+0307, if |
|
// needed, after these modifiers so that a string in FCD form |
|
// will remain so. See A.2.2. |
|
lower(c) |
|
i = 1 |
|
} else { |
|
return lower(c) |
|
} |
|
} |
|
|
|
for ; i < maxIgnorable && c.next(); i++ { |
|
switch c.info.cccType() { |
|
case cccZero: |
|
c.unreadRune() |
|
return true |
|
case cccAbove: |
|
return c.writeString("\u0307") && c.copy() // See A.1. |
|
default: |
|
c.copy() // See A.1. |
|
} |
|
} |
|
return i == maxIgnorable |
|
} |
|
|
|
// ltLowerSpan would be the same as isLower. |
|
|
|
func ltUpper(f mapFunc) mapFunc { |
|
return func(c *context) bool { |
|
// Unicode: |
|
// 0307; 0307; ; ; lt After_Soft_Dotted; # COMBINING DOT ABOVE |
|
// |
|
// From CLDR: |
|
// # Remove \u0307 following soft-dotteds (i, j, and the like), with possible |
|
// # intervening non-230 marks. |
|
// ::NFD(); |
|
// [:Soft_Dotted:] [^[:ccc=Not_Reordered:][:ccc=Above:]]* { \u0307 → ; |
|
// ::Any-Upper(); |
|
// ::NFC(); |
|
|
|
// TODO: See A.5. A soft-dotted rune never has an exception. This would |
|
// allow us to overload the exception bit and encode this property in |
|
// info. Need to measure performance impact of this. |
|
r, _ := utf8.DecodeRune(c.src[c.pSrc:]) |
|
oldPDst := c.pDst |
|
if !f(c) { |
|
return false |
|
} |
|
if !unicode.Is(unicode.Soft_Dotted, r) { |
|
return true |
|
} |
|
|
|
// We don't need to do an NFD normalization, as a soft-dotted rune never |
|
// contains U+0307. See A.3. |
|
|
|
i := 0 |
|
for ; i < maxIgnorable && c.next(); i++ { |
|
switch c.info.cccType() { |
|
case cccZero: |
|
c.unreadRune() |
|
return true |
|
case cccAbove: |
|
if c.hasPrefix("\u0307") { |
|
// We don't do a full NFC, but rather combine runes for |
|
// some of the common cases. (Returning NFC or |
|
// preserving normal form is neither a requirement nor |
|
// a possibility anyway). |
|
if !c.next() { |
|
return false |
|
} |
|
if c.dst[oldPDst] == 'I' && c.pDst == oldPDst+1 && c.src[c.pSrc] == 0xcc { |
|
s := "" |
|
switch c.src[c.pSrc+1] { |
|
case 0x80: // U+0300 COMBINING GRAVE ACCENT |
|
s = "\u00cc" // U+00CC LATIN CAPITAL LETTER I WITH GRAVE |
|
case 0x81: // U+0301 COMBINING ACUTE ACCENT |
|
s = "\u00cd" // U+00CD LATIN CAPITAL LETTER I WITH ACUTE |
|
case 0x83: // U+0303 COMBINING TILDE |
|
s = "\u0128" // U+0128 LATIN CAPITAL LETTER I WITH TILDE |
|
case 0x88: // U+0308 COMBINING DIAERESIS |
|
s = "\u00cf" // U+00CF LATIN CAPITAL LETTER I WITH DIAERESIS |
|
default: |
|
} |
|
if s != "" { |
|
c.pDst = oldPDst |
|
return c.writeString(s) |
|
} |
|
} |
|
} |
|
return c.copy() |
|
default: |
|
c.copy() |
|
} |
|
} |
|
return i == maxIgnorable |
|
} |
|
} |
|
|
|
// TODO: implement ltUpperSpan (low priority: complex and infrequent). |
|
|
|
func aztrUpper(f mapFunc) mapFunc { |
|
return func(c *context) bool { |
|
// i→İ; |
|
if c.src[c.pSrc] == 'i' { |
|
return c.writeString("İ") |
|
} |
|
return f(c) |
|
} |
|
} |
|
|
|
func aztrLower(c *context) (done bool) { |
|
// From CLDR: |
|
// # I and i-dotless; I-dot and i are case pairs in Turkish and Azeri |
|
// # 0130; 0069; 0130; 0130; tr; # LATIN CAPITAL LETTER I WITH DOT ABOVE |
|
// İ→i; |
|
// # When lowercasing, remove dot_above in the sequence I + dot_above, which will turn into i. |
|
// # This matches the behavior of the canonically equivalent I-dot_above |
|
// # 0307; ; 0307; 0307; tr After_I; # COMBINING DOT ABOVE |
|
// # When lowercasing, unless an I is before a dot_above, it turns into a dotless i. |
|
// # 0049; 0131; 0049; 0049; tr Not_Before_Dot; # LATIN CAPITAL LETTER I |
|
// I([^[:ccc=Not_Reordered:][:ccc=Above:]]*)\u0307 → i$1 ; |
|
// I→ı ; |
|
// ::Any-Lower(); |
|
if c.hasPrefix("\u0130") { // İ |
|
return c.writeString("i") |
|
} |
|
if c.src[c.pSrc] != 'I' { |
|
return lower(c) |
|
} |
|
|
|
// We ignore the lower-case I for now, but insert it later when we know |
|
// which form we need. |
|
start := c.pSrc + c.sz |
|
|
|
i := 0 |
|
Loop: |
|
// We check for up to n ignorables before \u0307. As \u0307 is an |
|
// ignorable as well, n is maxIgnorable-1. |
|
for ; i < maxIgnorable && c.next(); i++ { |
|
switch c.info.cccType() { |
|
case cccAbove: |
|
if c.hasPrefix("\u0307") { |
|
return c.writeString("i") && c.writeBytes(c.src[start:c.pSrc]) // ignore U+0307 |
|
} |
|
done = true |
|
break Loop |
|
case cccZero: |
|
c.unreadRune() |
|
done = true |
|
break Loop |
|
default: |
|
// We'll write this rune after we know which starter to use. |
|
} |
|
} |
|
if i == maxIgnorable { |
|
done = true |
|
} |
|
return c.writeString("ı") && c.writeBytes(c.src[start:c.pSrc+c.sz]) && done |
|
} |
|
|
|
// aztrLowerSpan would be the same as isLower. |
|
|
|
func nlTitle(c *context) bool { |
|
// From CLDR: |
|
// # Special titlecasing for Dutch initial "ij". |
|
// ::Any-Title(); |
|
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) |
|
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; |
|
if c.src[c.pSrc] != 'I' && c.src[c.pSrc] != 'i' { |
|
return title(c) |
|
} |
|
|
|
if !c.writeString("I") || !c.next() { |
|
return false |
|
} |
|
if c.src[c.pSrc] == 'j' || c.src[c.pSrc] == 'J' { |
|
return c.writeString("J") |
|
} |
|
c.unreadRune() |
|
return true |
|
} |
|
|
|
func nlTitleSpan(c *context) bool { |
|
// From CLDR: |
|
// # Special titlecasing for Dutch initial "ij". |
|
// ::Any-Title(); |
|
// # Fix up Ij at the beginning of a "word" (per Any-Title, notUAX #29) |
|
// [:^WB=ALetter:] [:WB=Extend:]* [[:WB=MidLetter:][:WB=MidNumLet:]]? { Ij } → IJ ; |
|
if c.src[c.pSrc] != 'I' { |
|
return isTitle(c) |
|
} |
|
if !c.next() || c.src[c.pSrc] == 'j' { |
|
return false |
|
} |
|
if c.src[c.pSrc] != 'J' { |
|
c.unreadRune() |
|
} |
|
return true |
|
} |
|
|
|
// Not part of CLDR, but see https://unicode.org/cldr/trac/ticket/7078. |
|
func afnlRewrite(c *context) { |
|
if c.hasPrefix("'") || c.hasPrefix("’") { |
|
c.isMidWord = true |
|
} |
|
}
|
|
|