You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
355 lines
8.5 KiB
355 lines
8.5 KiB
// Copyright 2014 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
// Package runes provides transforms for UTF-8 encoded text.
|
package runes // import "golang.org/x/text/runes" |
|
|
|
import ( |
|
"unicode" |
|
"unicode/utf8" |
|
|
|
"golang.org/x/text/transform" |
|
) |
|
|
|
// Set describes a collection of runes purely in terms of a membership test.
type Set interface {
	// Contains reports whether the rune r is a member of the set.
	Contains(r rune) bool
}
|
|
|
// setFunc adapts a plain predicate function so that it satisfies the Set
// interface.
type setFunc func(rune) bool

// Contains reports the result of calling the underlying predicate with r.
func (fn setFunc) Contains(r rune) bool {
	member := fn(r)
	return member
}
|
|
|
// Note: using funcs here instead of wrapping types results in cleaner
|
// documentation and a smaller API. |
|
|
|
// In creates a Set with a Contains method that returns true for all runes in |
|
// the given RangeTable. |
|
func In(rt *unicode.RangeTable) Set { |
|
return setFunc(func(r rune) bool { return unicode.Is(rt, r) }) |
|
} |
|
|
|
// In creates a Set with a Contains method that returns true for all runes not |
|
// in the given RangeTable. |
|
func NotIn(rt *unicode.RangeTable) Set { |
|
return setFunc(func(r rune) bool { return !unicode.Is(rt, r) }) |
|
} |
|
|
|
// Predicate creates a Set with a Contains method that returns f(r). |
|
func Predicate(f func(rune) bool) Set { |
|
return setFunc(f) |
|
} |
|
|
|
// Transformer implements the transform.Transformer interface. |
|
type Transformer struct { |
|
t transform.SpanningTransformer |
|
} |
|
|
|
func (t Transformer) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
return t.t.Transform(dst, src, atEOF) |
|
} |
|
|
|
func (t Transformer) Span(b []byte, atEOF bool) (n int, err error) { |
|
return t.t.Span(b, atEOF) |
|
} |
|
|
|
func (t Transformer) Reset() { t.t.Reset() } |
|
|
|
// Bytes returns a new byte slice with the result of converting b using t. It |
|
// calls Reset on t. It returns nil if any error was found. This can only happen |
|
// if an error-producing Transformer is passed to If. |
|
func (t Transformer) Bytes(b []byte) []byte { |
|
b, _, err := transform.Bytes(t, b) |
|
if err != nil { |
|
return nil |
|
} |
|
return b |
|
} |
|
|
|
// String returns a string with the result of converting s using t. It calls |
|
// Reset on t. It returns the empty string if any error was found. This can only |
|
// happen if an error-producing Transformer is passed to If. |
|
func (t Transformer) String(s string) string { |
|
s, _, err := transform.String(t, s) |
|
if err != nil { |
|
return "" |
|
} |
|
return s |
|
} |
|
|
|
// TODO: |
|
// - Copy: copying strings and bytes in whole-rune units. |
|
// - Validation (maybe) |
|
// - Well-formed-ness (maybe) |
|
|
|
// runeErrorString holds the UTF-8 encoding of utf8.RuneError (U+FFFD). The
// transformers in this file index its three bytes when substituting
// ill-formed input.
const runeErrorString = string(rune(utf8.RuneError))
|
|
|
// Remove returns a Transformer that removes runes r for which s.Contains(r). |
|
// Illegal input bytes are replaced by RuneError before being passed to f. |
|
func Remove(s Set) Transformer { |
|
if f, ok := s.(setFunc); ok { |
|
// This little trick cuts the running time of BenchmarkRemove for sets |
|
// created by Predicate roughly in half. |
|
// TODO: special-case RangeTables as well. |
|
return Transformer{remove(f)} |
|
} |
|
return Transformer{remove(s.Contains)} |
|
} |
|
|
|
// TODO: remove transform.RemoveFunc. |
|
|
|
// remove is a predicate-backed transformer: runes for which the function
// returns true are the ones dropped by its Transform method.
type remove func(r rune) bool

// Reset is a no-op; a remove value carries no state between calls.
func (remove) Reset() {
}
|
|
|
// Span implements transform.Spanner. |
|
func (t remove) Span(src []byte, atEOF bool) (n int, err error) { |
|
for r, size := rune(0), 0; n < len(src); { |
|
if r = rune(src[n]); r < utf8.RuneSelf { |
|
size = 1 |
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
|
// Invalid rune. |
|
if !atEOF && !utf8.FullRune(src[n:]) { |
|
err = transform.ErrShortSrc |
|
} else { |
|
err = transform.ErrEndOfSpan |
|
} |
|
break |
|
} |
|
if t(r) { |
|
err = transform.ErrEndOfSpan |
|
break |
|
} |
|
n += size |
|
} |
|
return |
|
} |
|
|
|
// Transform implements transform.Transformer. |
|
func (t remove) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
for r, size := rune(0), 0; nSrc < len(src); { |
|
if r = rune(src[nSrc]); r < utf8.RuneSelf { |
|
size = 1 |
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
|
// Invalid rune. |
|
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
err = transform.ErrShortSrc |
|
break |
|
} |
|
// We replace illegal bytes with RuneError. Not doing so might |
|
// otherwise turn a sequence of invalid UTF-8 into valid UTF-8. |
|
// The resulting byte sequence may subsequently contain runes |
|
// for which t(r) is true that were passed unnoticed. |
|
if !t(utf8.RuneError) { |
|
if nDst+3 > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
dst[nDst+0] = runeErrorString[0] |
|
dst[nDst+1] = runeErrorString[1] |
|
dst[nDst+2] = runeErrorString[2] |
|
nDst += 3 |
|
} |
|
nSrc++ |
|
continue |
|
} |
|
if t(r) { |
|
nSrc += size |
|
continue |
|
} |
|
if nDst+size > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
for i := 0; i < size; i++ { |
|
dst[nDst] = src[nSrc] |
|
nDst++ |
|
nSrc++ |
|
} |
|
} |
|
return |
|
} |
|
|
|
// Map returns a Transformer that maps the runes in the input using the given |
|
// mapping. Illegal bytes in the input are converted to utf8.RuneError before |
|
// being passed to the mapping func. |
|
func Map(mapping func(rune) rune) Transformer { |
|
return Transformer{mapper(mapping)} |
|
} |
|
|
|
// mapper is a function-backed transformer: its Transform method replaces
// each input rune with the rune the function returns for it.
type mapper func(rune) rune

// Reset is a no-op; a mapper value carries no state between calls.
func (mapper) Reset() {
}
|
|
|
// Span implements transform.Spanner. |
|
func (t mapper) Span(src []byte, atEOF bool) (n int, err error) { |
|
for r, size := rune(0), 0; n < len(src); n += size { |
|
if r = rune(src[n]); r < utf8.RuneSelf { |
|
size = 1 |
|
} else if r, size = utf8.DecodeRune(src[n:]); size == 1 { |
|
// Invalid rune. |
|
if !atEOF && !utf8.FullRune(src[n:]) { |
|
err = transform.ErrShortSrc |
|
} else { |
|
err = transform.ErrEndOfSpan |
|
} |
|
break |
|
} |
|
if t(r) != r { |
|
err = transform.ErrEndOfSpan |
|
break |
|
} |
|
} |
|
return n, err |
|
} |
|
|
|
// Transform implements transform.Transformer. |
|
func (t mapper) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
var replacement rune |
|
var b [utf8.UTFMax]byte |
|
|
|
for r, size := rune(0), 0; nSrc < len(src); { |
|
if r = rune(src[nSrc]); r < utf8.RuneSelf { |
|
if replacement = t(r); replacement < utf8.RuneSelf { |
|
if nDst == len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
dst[nDst] = byte(replacement) |
|
nDst++ |
|
nSrc++ |
|
continue |
|
} |
|
size = 1 |
|
} else if r, size = utf8.DecodeRune(src[nSrc:]); size == 1 { |
|
// Invalid rune. |
|
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
err = transform.ErrShortSrc |
|
break |
|
} |
|
|
|
if replacement = t(utf8.RuneError); replacement == utf8.RuneError { |
|
if nDst+3 > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
dst[nDst+0] = runeErrorString[0] |
|
dst[nDst+1] = runeErrorString[1] |
|
dst[nDst+2] = runeErrorString[2] |
|
nDst += 3 |
|
nSrc++ |
|
continue |
|
} |
|
} else if replacement = t(r); replacement == r { |
|
if nDst+size > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
for i := 0; i < size; i++ { |
|
dst[nDst] = src[nSrc] |
|
nDst++ |
|
nSrc++ |
|
} |
|
continue |
|
} |
|
|
|
n := utf8.EncodeRune(b[:], replacement) |
|
|
|
if nDst+n > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
for i := 0; i < n; i++ { |
|
dst[nDst] = b[i] |
|
nDst++ |
|
} |
|
nSrc += size |
|
} |
|
return |
|
} |
|
|
|
// ReplaceIllFormed returns a transformer that replaces all input bytes that are |
|
// not part of a well-formed UTF-8 code sequence with utf8.RuneError. |
|
func ReplaceIllFormed() Transformer { |
|
return Transformer{&replaceIllFormed{}} |
|
} |
|
|
|
type replaceIllFormed struct{ transform.NopResetter } |
|
|
|
func (t replaceIllFormed) Span(src []byte, atEOF bool) (n int, err error) { |
|
for n < len(src) { |
|
// ASCII fast path. |
|
if src[n] < utf8.RuneSelf { |
|
n++ |
|
continue |
|
} |
|
|
|
r, size := utf8.DecodeRune(src[n:]) |
|
|
|
// Look for a valid non-ASCII rune. |
|
if r != utf8.RuneError || size != 1 { |
|
n += size |
|
continue |
|
} |
|
|
|
// Look for short source data. |
|
if !atEOF && !utf8.FullRune(src[n:]) { |
|
err = transform.ErrShortSrc |
|
break |
|
} |
|
|
|
// We have an invalid rune. |
|
err = transform.ErrEndOfSpan |
|
break |
|
} |
|
return n, err |
|
} |
|
|
|
func (t replaceIllFormed) Transform(dst, src []byte, atEOF bool) (nDst, nSrc int, err error) { |
|
for nSrc < len(src) { |
|
// ASCII fast path. |
|
if r := src[nSrc]; r < utf8.RuneSelf { |
|
if nDst == len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
dst[nDst] = r |
|
nDst++ |
|
nSrc++ |
|
continue |
|
} |
|
|
|
// Look for a valid non-ASCII rune. |
|
if _, size := utf8.DecodeRune(src[nSrc:]); size != 1 { |
|
if size != copy(dst[nDst:], src[nSrc:nSrc+size]) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
nDst += size |
|
nSrc += size |
|
continue |
|
} |
|
|
|
// Look for short source data. |
|
if !atEOF && !utf8.FullRune(src[nSrc:]) { |
|
err = transform.ErrShortSrc |
|
break |
|
} |
|
|
|
// We have an invalid rune. |
|
if nDst+3 > len(dst) { |
|
err = transform.ErrShortDst |
|
break |
|
} |
|
dst[nDst+0] = runeErrorString[0] |
|
dst[nDst+1] = runeErrorString[1] |
|
dst[nDst+2] = runeErrorString[2] |
|
nDst += 3 |
|
nSrc++ |
|
} |
|
return nDst, nSrc, err |
|
}
|
|
|