You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
139 lines
3.6 KiB
139 lines
3.6 KiB
// Copyright 2016 The Go Authors. All rights reserved.
// Use of this source code is governed by a BSD-style
// license that can be found in the LICENSE file.
|
|
|
package precis |
|
|
|
import "errors" |
|
|
|
// This file contains tables and code related to context rules.
|
|
|
// catBitmap is a set of the b* flag bits declared below, one bit per flag.
// Eleven bits are in use, so a uint16 is sufficient.
type catBitmap uint16

const (
	// These bits, once set depending on the current value, are never unset.
	bJapanese catBitmap = 1 << iota
	bArabicIndicDigit
	bExtendedArabicIndicDigit

	// These bits are set on each iteration depending on the current value.
	bJoinStart
	bJoinMid
	bJoinEnd
	bVirama
	bLatinSmallL
	bGreek
	bHebrew

	// These bits indicate which of the permanent bits need to be set at the
	// end of the checks.
	bMustHaveJapn

	// permanent is the union of the bits that init ORs into the keep mask of
	// every entry in categoryTransitions.
	permanent = bJapanese | bArabicIndicDigit | bExtendedArabicIndicDigit | bMustHaveJapn
)
|
|
|
// finalShift is the bit position of bMustHaveJapn: shifting bMustHaveJapn
// right by finalShift yields bJapanese.
// NOTE(review): presumably used to map "must have" bits onto the permanent
// bits they require at the end of the checks — confirm against the caller.
const finalShift = 10
|
|
|
// errContext is the error returned by the rule functions in
// categoryTransitions when a contextual rule is violated.
var errContext = errors.New("precis: contextual rule violated")
|
|
|
func init() { |
|
// Programmatically set these required bits as, manually setting them seems |
|
// too error prone. |
|
for i, ct := range categoryTransitions { |
|
categoryTransitions[i].keep |= permanent |
|
categoryTransitions[i].accept |= ct.term |
|
} |
|
} |
|
|
|
var categoryTransitions = []struct { |
|
keep catBitmap // mask selecting which bits to keep from the previous state |
|
set catBitmap // mask for which bits to set for this transition |
|
|
|
// These bitmaps are used for rules that require lookahead. |
|
// term&accept == term must be true, which is enforced programmatically. |
|
term catBitmap // bits accepted as termination condition |
|
accept catBitmap // bits that pass, but not sufficient as termination |
|
|
|
// The rule function cannot take a *context as an argument, as it would |
|
// cause the context to escape, adding significant overhead. |
|
rule func(beforeBits catBitmap) (doLookahead bool, err error) |
|
}{ |
|
joiningL: {set: bJoinStart}, |
|
joiningD: {set: bJoinStart | bJoinEnd}, |
|
joiningT: {keep: bJoinStart, set: bJoinMid}, |
|
joiningR: {set: bJoinEnd}, |
|
viramaModifier: {set: bVirama}, |
|
viramaJoinT: {set: bVirama | bJoinMid}, |
|
latinSmallL: {set: bLatinSmallL}, |
|
greek: {set: bGreek}, |
|
greekJoinT: {set: bGreek | bJoinMid}, |
|
hebrew: {set: bHebrew}, |
|
hebrewJoinT: {set: bHebrew | bJoinMid}, |
|
japanese: {set: bJapanese}, |
|
katakanaMiddleDot: {set: bMustHaveJapn}, |
|
|
|
zeroWidthNonJoiner: { |
|
term: bJoinEnd, |
|
accept: bJoinMid, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bVirama != 0 { |
|
return false, nil |
|
} |
|
if before&bJoinStart == 0 { |
|
return false, errContext |
|
} |
|
return true, nil |
|
}, |
|
}, |
|
zeroWidthJoiner: { |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bVirama == 0 { |
|
err = errContext |
|
} |
|
return false, err |
|
}, |
|
}, |
|
middleDot: { |
|
term: bLatinSmallL, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bLatinSmallL == 0 { |
|
return false, errContext |
|
} |
|
return true, nil |
|
}, |
|
}, |
|
greekLowerNumeralSign: { |
|
set: bGreek, |
|
term: bGreek, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
return true, nil |
|
}, |
|
}, |
|
hebrewPreceding: { |
|
set: bHebrew, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bHebrew == 0 { |
|
err = errContext |
|
} |
|
return false, err |
|
}, |
|
}, |
|
arabicIndicDigit: { |
|
set: bArabicIndicDigit, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bExtendedArabicIndicDigit != 0 { |
|
err = errContext |
|
} |
|
return false, err |
|
}, |
|
}, |
|
extendedArabicIndicDigit: { |
|
set: bExtendedArabicIndicDigit, |
|
rule: func(before catBitmap) (doLookAhead bool, err error) { |
|
if before&bArabicIndicDigit != 0 { |
|
err = errContext |
|
} |
|
return false, err |
|
}, |
|
}, |
|
}
|
|
|