You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
206 lines
6.5 KiB
206 lines
6.5 KiB
// Copyright 2015 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
//go:generate stringer -type=Kind
//go:generate go run gen.go gen_common.go gen_trieval.go
|
|
|
// Package width provides functionality for handling different widths in text.
//
// Wide characters behave like ideographs; they tend to allow line breaks after
// each character and remain upright in vertical text layout. Narrow characters
// are kept together in words or runs that are rotated sideways in vertical text
// layout.
//
// For more information, see https://unicode.org/reports/tr11/.
|
package width // import "golang.org/x/text/width" |
|
|
|
import ( |
|
"unicode/utf8" |
|
|
|
"golang.org/x/text/transform" |
|
) |
|
|
|
// TODO
// 1) Reduce table size by compressing blocks.
// 2) API proposition for computing display length
//    (approximation, fixed pitch only).
// 3) Implement display length.
|
|
|
// Kind indicates the type of width property as defined in https://unicode.org/reports/tr11/.
type Kind int

const (
	// Neutral characters do not occur in legacy East Asian character sets.
	Neutral Kind = iota

	// EastAsianAmbiguous characters can be sometimes wide and sometimes
	// narrow and require additional information not contained in the character
	// code to further resolve their width.
	EastAsianAmbiguous

	// EastAsianWide characters are wide in their usual form. They occur only in
	// the context of East Asian typography. These runes may have explicit
	// halfwidth counterparts.
	EastAsianWide

	// EastAsianNarrow characters are narrow in their usual form. They often have
	// fullwidth counterparts.
	EastAsianNarrow

	// Note: there exist Narrow runes that do not have fullwidth or wide
	// counterparts, despite what the definition says (e.g. U+27E6).

	// EastAsianFullwidth characters have a compatibility decomposition of type
	// wide that maps to a narrow counterpart.
	EastAsianFullwidth

	// EastAsianHalfwidth characters have a compatibility decomposition of type
	// narrow that maps to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
	// SIGN.
	EastAsianHalfwidth

	// Note: there exist runes that have a halfwidth counterpart but that are
	// classified as Ambiguous, rather than wide (e.g. U+2190).
)
|
|
|
// TODO: the generated tries need to return size 1 for invalid runes for the
// width to be computed correctly (each byte should render width 1)
|
|
|
var trie = newWidthTrie(0) |
|
|
|
// Lookup reports the Properties of the first rune in b and the number of bytes |
|
// of its UTF-8 encoding. |
|
func Lookup(b []byte) (p Properties, size int) { |
|
v, sz := trie.lookup(b) |
|
return Properties{elem(v), b[sz-1]}, sz |
|
} |
|
|
|
// LookupString reports the Properties of the first rune in s and the number of |
|
// bytes of its UTF-8 encoding. |
|
func LookupString(s string) (p Properties, size int) { |
|
v, sz := trie.lookupString(s) |
|
return Properties{elem(v), s[sz-1]}, sz |
|
} |
|
|
|
// LookupRune reports the Properties of rune r. |
|
func LookupRune(r rune) Properties { |
|
var buf [4]byte |
|
n := utf8.EncodeRune(buf[:], r) |
|
v, _ := trie.lookup(buf[:n]) |
|
last := byte(r) |
|
if r >= utf8.RuneSelf { |
|
last = 0x80 + byte(r&0x3f) |
|
} |
|
return Properties{elem(v), last} |
|
} |
|
|
|
// Properties provides access to width properties of a rune. |
|
type Properties struct { |
|
elem elem |
|
last byte |
|
} |
|
|
|
func (e elem) kind() Kind { |
|
return Kind(e >> typeShift) |
|
} |
|
|
|
// Kind returns the Kind of a rune as defined in Unicode TR #11. |
|
// See https://unicode.org/reports/tr11/ for more details. |
|
func (p Properties) Kind() Kind { |
|
return p.elem.kind() |
|
} |
|
|
|
// Folded returns the folded variant of a rune or 0 if the rune is canonical. |
|
func (p Properties) Folded() rune { |
|
if p.elem&tagNeedsFold != 0 { |
|
buf := inverseData[byte(p.elem)] |
|
buf[buf[0]] ^= p.last |
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
|
return r |
|
} |
|
return 0 |
|
} |
|
|
|
// Narrow returns the narrow variant of a rune or 0 if the rune is already |
|
// narrow or doesn't have a narrow variant. |
|
func (p Properties) Narrow() rune { |
|
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) { |
|
buf := inverseData[byte(p.elem)] |
|
buf[buf[0]] ^= p.last |
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
|
return r |
|
} |
|
return 0 |
|
} |
|
|
|
// Wide returns the wide variant of a rune or 0 if the rune is already |
|
// wide or doesn't have a wide variant. |
|
func (p Properties) Wide() rune { |
|
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) { |
|
buf := inverseData[byte(p.elem)] |
|
buf[buf[0]] ^= p.last |
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]]) |
|
return r |
|
} |
|
return 0 |
|
} |
|
|
|
// TODO for Properties:
// - Add Fullwidth/Halfwidth or Inverted methods for computing variants
//   mapping.
// - Add width information (including information on non-spacing runes).
|
|
|
// Transformer implements the transform.Transformer interface. |
|
type Transformer struct { |
|
t transform.SpanningTransformer |
|
} |
|
|
|
// Reset implements the transform.Transformer interface. |
|
func (t Transformer) Reset() { t.t.Reset() } |
|
|
|
// Transform implements the transform.Transformer interface. |
|
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
return t.t.Transform(dst, src, atEOF) |
|
} |
|
|
|
// Span implements the transform.SpanningTransformer interface. |
|
func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) { |
|
return t.t.Span(src, atEOF) |
|
} |
|
|
|
// Bytes returns a new byte slice with the result of applying t to b. |
|
func (t Transformer) Bytes(b []byte) []byte { |
|
b, _, _ = transform.Bytes(t, b) |
|
return b |
|
} |
|
|
|
// String returns a string with the result of applying t to s. |
|
func (t Transformer) String(s string) string { |
|
s, _, _ = transform.String(t, s) |
|
return s |
|
} |
|
|
|
var ( |
|
// Fold is a transform that maps all runes to their canonical width. |
|
// |
|
// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm |
|
// provide a more generic folding mechanism. |
|
Fold Transformer = Transformer{foldTransform{}} |
|
|
|
// Widen is a transform that maps runes to their wide variant, if |
|
// available. |
|
Widen Transformer = Transformer{wideTransform{}} |
|
|
|
// Narrow is a transform that maps runes to their narrow variant, if |
|
// available. |
|
Narrow Transformer = Transformer{narrowTransform{}} |
|
) |
|
|
|
// TODO: Consider the following options:
// - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
//   generalized variant of this.
// - Consider a wide Won character to be the default width (or some generalized
//   variant of this).
// - Filter the set of characters that gets converted (the preferred approach is
//   to allow applying filters to transforms).
|
|
|