You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
207 lines
6.5 KiB
207 lines
6.5 KiB
3 years ago
|
// Copyright 2015 The Go Authors. All rights reserved.
|
||
|
// Use of this source code is governed by a BSD-style
|
||
|
// license that can be found in the LICENSE file.
|
||
|
|
||
|
//go:generate stringer -type=Kind
|
||
|
//go:generate go run gen.go gen_common.go gen_trieval.go
|
||
|
|
||
|
// Package width provides functionality for handling different widths in text.
|
||
|
//
|
||
|
// Wide characters behave like ideographs; they tend to allow line breaks after
|
||
|
// each character and remain upright in vertical text layout. Narrow characters
|
||
|
// are kept together in words or runs that are rotated sideways in vertical text
|
||
|
// layout.
|
||
|
//
|
||
|
// For more information, see https://unicode.org/reports/tr11/.
|
||
|
package width // import "golang.org/x/text/width"
|
||
|
|
||
|
import (
|
||
|
"unicode/utf8"
|
||
|
|
||
|
"golang.org/x/text/transform"
|
||
|
)
|
||
|
|
||
|
// TODO
|
||
|
// 1) Reduce table size by compressing blocks.
|
||
|
// 2) API proposition for computing display length
|
||
|
// (approximation, fixed pitch only).
|
||
|
// 3) Implement display length.
|
||
|
|
||
|
// Kind indicates the type of width property as defined in https://unicode.org/reports/tr11/.
|
||
|
type Kind int
|
||
|
|
||
|
const (
|
||
|
// Neutral characters do not occur in legacy East Asian character sets.
|
||
|
Neutral Kind = iota
|
||
|
|
||
|
// EastAsianAmbiguous characters that can be sometimes wide and sometimes
|
||
|
// narrow and require additional information not contained in the character
|
||
|
// code to further resolve their width.
|
||
|
EastAsianAmbiguous
|
||
|
|
||
|
// EastAsianWide characters are wide in its usual form. They occur only in
|
||
|
// the context of East Asian typography. These runes may have explicit
|
||
|
// halfwidth counterparts.
|
||
|
EastAsianWide
|
||
|
|
||
|
// EastAsianNarrow characters are narrow in its usual form. They often have
|
||
|
// fullwidth counterparts.
|
||
|
EastAsianNarrow
|
||
|
|
||
|
// Note: there exist Narrow runes that do not have fullwidth or wide
|
||
|
// counterparts, despite what the definition says (e.g. U+27E6).
|
||
|
|
||
|
// EastAsianFullwidth characters have a compatibility decompositions of type
|
||
|
// wide that map to a narrow counterpart.
|
||
|
EastAsianFullwidth
|
||
|
|
||
|
// EastAsianHalfwidth characters have a compatibility decomposition of type
|
||
|
// narrow that map to a wide or ambiguous counterpart, plus U+20A9 ₩ WON
|
||
|
// SIGN.
|
||
|
EastAsianHalfwidth
|
||
|
|
||
|
// Note: there exist runes that have a halfwidth counterparts but that are
|
||
|
// classified as Ambiguous, rather than wide (e.g. U+2190).
|
||
|
)
|
||
|
|
||
|
// TODO: the generated tries need to return size 1 for invalid runes for the
|
||
|
// width to be computed correctly (each byte should render width 1)
|
||
|
|
||
|
var trie = newWidthTrie(0)
|
||
|
|
||
|
// Lookup reports the Properties of the first rune in b and the number of bytes
|
||
|
// of its UTF-8 encoding.
|
||
|
func Lookup(b []byte) (p Properties, size int) {
|
||
|
v, sz := trie.lookup(b)
|
||
|
return Properties{elem(v), b[sz-1]}, sz
|
||
|
}
|
||
|
|
||
|
// LookupString reports the Properties of the first rune in s and the number of
|
||
|
// bytes of its UTF-8 encoding.
|
||
|
func LookupString(s string) (p Properties, size int) {
|
||
|
v, sz := trie.lookupString(s)
|
||
|
return Properties{elem(v), s[sz-1]}, sz
|
||
|
}
|
||
|
|
||
|
// LookupRune reports the Properties of rune r.
|
||
|
func LookupRune(r rune) Properties {
|
||
|
var buf [4]byte
|
||
|
n := utf8.EncodeRune(buf[:], r)
|
||
|
v, _ := trie.lookup(buf[:n])
|
||
|
last := byte(r)
|
||
|
if r >= utf8.RuneSelf {
|
||
|
last = 0x80 + byte(r&0x3f)
|
||
|
}
|
||
|
return Properties{elem(v), last}
|
||
|
}
|
||
|
|
||
|
// Properties provides access to width properties of a rune.
|
||
|
type Properties struct {
|
||
|
elem elem
|
||
|
last byte
|
||
|
}
|
||
|
|
||
|
func (e elem) kind() Kind {
|
||
|
return Kind(e >> typeShift)
|
||
|
}
|
||
|
|
||
|
// Kind returns the Kind of a rune as defined in Unicode TR #11.
|
||
|
// See https://unicode.org/reports/tr11/ for more details.
|
||
|
func (p Properties) Kind() Kind {
|
||
|
return p.elem.kind()
|
||
|
}
|
||
|
|
||
|
// Folded returns the folded variant of a rune or 0 if the rune is canonical.
|
||
|
func (p Properties) Folded() rune {
|
||
|
if p.elem&tagNeedsFold != 0 {
|
||
|
buf := inverseData[byte(p.elem)]
|
||
|
buf[buf[0]] ^= p.last
|
||
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||
|
return r
|
||
|
}
|
||
|
return 0
|
||
|
}
|
||
|
|
||
|
// Narrow returns the narrow variant of a rune or 0 if the rune is already
|
||
|
// narrow or doesn't have a narrow variant.
|
||
|
func (p Properties) Narrow() rune {
|
||
|
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianFullwidth || k == EastAsianWide || k == EastAsianAmbiguous) {
|
||
|
buf := inverseData[byte(p.elem)]
|
||
|
buf[buf[0]] ^= p.last
|
||
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||
|
return r
|
||
|
}
|
||
|
return 0
|
||
|
}
|
||
|
|
||
|
// Wide returns the wide variant of a rune or 0 if the rune is already
|
||
|
// wide or doesn't have a wide variant.
|
||
|
func (p Properties) Wide() rune {
|
||
|
if k := p.elem.kind(); byte(p.elem) != 0 && (k == EastAsianHalfwidth || k == EastAsianNarrow) {
|
||
|
buf := inverseData[byte(p.elem)]
|
||
|
buf[buf[0]] ^= p.last
|
||
|
r, _ := utf8.DecodeRune(buf[1 : 1+buf[0]])
|
||
|
return r
|
||
|
}
|
||
|
return 0
|
||
|
}
|
||
|
|
||
|
// TODO for Properties:
|
||
|
// - Add Fullwidth/Halfwidth or Inverted methods for computing variants
|
||
|
// mapping.
|
||
|
// - Add width information (including information on non-spacing runes).
|
||
|
|
||
|
// Transformer implements the transform.Transformer interface.
|
||
|
type Transformer struct {
|
||
|
t transform.SpanningTransformer
|
||
|
}
|
||
|
|
||
|
// Reset implements the transform.Transformer interface.
|
||
|
func (t Transformer) Reset() { t.t.Reset() }
|
||
|
|
||
|
// Transform implements the transform.Transformer interface.
|
||
|
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) {
|
||
|
return t.t.Transform(dst, src, atEOF)
|
||
|
}
|
||
|
|
||
|
// Span implements the transform.SpanningTransformer interface.
|
||
|
func (t Transformer) Span(src []byte, atEOF bool) (n int, err error) {
|
||
|
return t.t.Span(src, atEOF)
|
||
|
}
|
||
|
|
||
|
// Bytes returns a new byte slice with the result of applying t to b.
|
||
|
func (t Transformer) Bytes(b []byte) []byte {
|
||
|
b, _, _ = transform.Bytes(t, b)
|
||
|
return b
|
||
|
}
|
||
|
|
||
|
// String returns a string with the result of applying t to s.
|
||
|
func (t Transformer) String(s string) string {
|
||
|
s, _, _ = transform.String(t, s)
|
||
|
return s
|
||
|
}
|
||
|
|
||
|
var (
|
||
|
// Fold is a transform that maps all runes to their canonical width.
|
||
|
//
|
||
|
// Note that the NFKC and NFKD transforms in golang.org/x/text/unicode/norm
|
||
|
// provide a more generic folding mechanism.
|
||
|
Fold Transformer = Transformer{foldTransform{}}
|
||
|
|
||
|
// Widen is a transform that maps runes to their wide variant, if
|
||
|
// available.
|
||
|
Widen Transformer = Transformer{wideTransform{}}
|
||
|
|
||
|
// Narrow is a transform that maps runes to their narrow variant, if
|
||
|
// available.
|
||
|
Narrow Transformer = Transformer{narrowTransform{}}
|
||
|
)
|
||
|
|
||
|
// TODO: Consider the following options:
|
||
|
// - Treat Ambiguous runes that have a halfwidth counterpart as wide, or some
|
||
|
// generalized variant of this.
|
||
|
// - Consider a wide Won character to be the default width (or some generalized
|
||
|
// variant of this).
|
||
|
// - Filter the set of characters that gets converted (the preferred approach is
|
||
|
// to allow applying filters to transforms).
|