You can not select more than 25 topics
Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
335 lines
11 KiB
335 lines
11 KiB
// Copyright 2015 The Go Authors. All rights reserved. |
|
// Use of this source code is governed by a BSD-style |
|
// license that can be found in the LICENSE file. |
|
|
|
package bidi |
|
|
|
import ( |
|
"container/list" |
|
"fmt" |
|
"sort" |
|
) |
|
|
|
// This file contains a port of the reference implementation of the |
|
// Bidi Parentheses Algorithm: |
|
// https://www.unicode.org/Public/PROGRAMS/BidiReferenceJava/BidiPBAReference.java |
|
// |
|
// The implementation in this file covers definitions BD14-BD16 and rule N0 |
|
// of UAX#9. |
|
// |
|
// Some preprocessing is done for each rune before data is passed to this |
|
// algorithm: |
|
// - opening and closing brackets are identified |
|
// - a bracket pair type, like '(' and ')' is assigned a unique identifier that |
|
// is identical for the opening and closing bracket. It is left to do these |
|
// mappings. |
|
// - The BPA algorithm requires that bracket characters that are canonical |
|
// equivalents of each other be able to be substituted for each other. |
|
// It is the responsibility of the caller to do this canonicalization. |
|
// |
|
// In implementing BD16, this implementation departs slightly from the "logical" |
|
// algorithm defined in UAX#9. In particular, the stack referenced there |
|
// supports operations that go beyond a "basic" stack. An equivalent |
|
// implementation based on a linked list is used here. |
|
|
|
// Bidi_Paired_Bracket_Type |
|
// BD14. An opening paired bracket is a character whose |
|
// Bidi_Paired_Bracket_Type property value is Open. |
|
// |
|
// BD15. A closing paired bracket is a character whose |
|
// Bidi_Paired_Bracket_Type property value is Close. |
|
type bracketType byte |
|
|
|
const ( |
|
bpNone bracketType = iota |
|
bpOpen |
|
bpClose |
|
) |
|
|
|
// bracketPair holds a pair of index values for opening and closing bracket |
|
// location of a bracket pair. |
|
type bracketPair struct { |
|
opener int |
|
closer int |
|
} |
|
|
|
func (b *bracketPair) String() string { |
|
return fmt.Sprintf("(%v, %v)", b.opener, b.closer) |
|
} |
|
|
|
// bracketPairs is a slice of bracketPairs with a sort.Interface implementation. |
|
type bracketPairs []bracketPair |
|
|
|
func (b bracketPairs) Len() int { return len(b) } |
|
func (b bracketPairs) Swap(i, j int) { b[i], b[j] = b[j], b[i] } |
|
func (b bracketPairs) Less(i, j int) bool { return b[i].opener < b[j].opener } |
|
|
|
// resolvePairedBrackets runs the paired bracket part of the UBA algorithm. |
|
// |
|
// For each rune, it takes the indexes into the original string, the class the |
|
// bracket type (in pairTypes) and the bracket identifier (pairValues). It also |
|
// takes the direction type for the start-of-sentence and the embedding level. |
|
// |
|
// The identifiers for bracket types are the rune of the canonicalized opening |
|
// bracket for brackets (open or close) or 0 for runes that are not brackets. |
|
func resolvePairedBrackets(s *isolatingRunSequence) { |
|
p := bracketPairer{ |
|
sos: s.sos, |
|
openers: list.New(), |
|
codesIsolatedRun: s.types, |
|
indexes: s.indexes, |
|
} |
|
dirEmbed := L |
|
if s.level&1 != 0 { |
|
dirEmbed = R |
|
} |
|
p.locateBrackets(s.p.pairTypes, s.p.pairValues) |
|
p.resolveBrackets(dirEmbed, s.p.initialTypes) |
|
} |
|
|
|
type bracketPairer struct { |
|
sos Class // direction corresponding to start of sequence |
|
|
|
// The following is a restatement of BD 16 using non-algorithmic language. |
|
// |
|
// A bracket pair is a pair of characters consisting of an opening |
|
// paired bracket and a closing paired bracket such that the |
|
// Bidi_Paired_Bracket property value of the former equals the latter, |
|
// subject to the following constraints. |
|
// - both characters of a pair occur in the same isolating run sequence |
|
// - the closing character of a pair follows the opening character |
|
// - any bracket character can belong at most to one pair, the earliest possible one |
|
// - any bracket character not part of a pair is treated like an ordinary character |
|
// - pairs may nest properly, but their spans may not overlap otherwise |
|
|
|
// Bracket characters with canonical decompositions are supposed to be |
|
// treated as if they had been normalized, to allow normalized and non- |
|
// normalized text to give the same result. In this implementation that step |
|
// is pushed out to the caller. The caller has to ensure that the pairValue |
|
// slices contain the rune of the opening bracket after normalization for |
|
// any opening or closing bracket. |
|
|
|
openers *list.List // list of positions for opening brackets |
|
|
|
// bracket pair positions sorted by location of opening bracket |
|
pairPositions bracketPairs |
|
|
|
codesIsolatedRun []Class // directional bidi codes for an isolated run |
|
indexes []int // array of index values into the original string |
|
|
|
} |
|
|
|
// matchOpener reports whether characters at given positions form a matching |
|
// bracket pair. |
|
func (p *bracketPairer) matchOpener(pairValues []rune, opener, closer int) bool { |
|
return pairValues[p.indexes[opener]] == pairValues[p.indexes[closer]] |
|
} |
|
|
|
const maxPairingDepth = 63 |
|
|
|
// locateBrackets locates matching bracket pairs according to BD16. |
|
// |
|
// This implementation uses a linked list instead of a stack, because, while |
|
// elements are added at the front (like a push) they are not generally removed |
|
// in atomic 'pop' operations, reducing the benefit of the stack archetype. |
|
func (p *bracketPairer) locateBrackets(pairTypes []bracketType, pairValues []rune) { |
|
// traverse the run |
|
// do that explicitly (not in a for-each) so we can record position |
|
for i, index := range p.indexes { |
|
|
|
// look at the bracket type for each character |
|
if pairTypes[index] == bpNone || p.codesIsolatedRun[i] != ON { |
|
// continue scanning |
|
continue |
|
} |
|
switch pairTypes[index] { |
|
case bpOpen: |
|
// check if maximum pairing depth reached |
|
if p.openers.Len() == maxPairingDepth { |
|
p.openers.Init() |
|
return |
|
} |
|
// remember opener location, most recent first |
|
p.openers.PushFront(i) |
|
|
|
case bpClose: |
|
// see if there is a match |
|
count := 0 |
|
for elem := p.openers.Front(); elem != nil; elem = elem.Next() { |
|
count++ |
|
opener := elem.Value.(int) |
|
if p.matchOpener(pairValues, opener, i) { |
|
// if the opener matches, add nested pair to the ordered list |
|
p.pairPositions = append(p.pairPositions, bracketPair{opener, i}) |
|
// remove up to and including matched opener |
|
for ; count > 0; count-- { |
|
p.openers.Remove(p.openers.Front()) |
|
} |
|
break |
|
} |
|
} |
|
sort.Sort(p.pairPositions) |
|
// if we get here, the closing bracket matched no openers |
|
// and gets ignored |
|
} |
|
} |
|
} |
|
|
|
// Bracket pairs within an isolating run sequence are processed as units so |
|
// that both the opening and the closing paired bracket in a pair resolve to |
|
// the same direction. |
|
// |
|
// N0. Process bracket pairs in an isolating run sequence sequentially in |
|
// the logical order of the text positions of the opening paired brackets |
|
// using the logic given below. Within this scope, bidirectional types EN |
|
// and AN are treated as R. |
|
// |
|
// Identify the bracket pairs in the current isolating run sequence |
|
// according to BD16. For each bracket-pair element in the list of pairs of |
|
// text positions: |
|
// |
|
// a Inspect the bidirectional types of the characters enclosed within the |
|
// bracket pair. |
|
// |
|
// b If any strong type (either L or R) matching the embedding direction is |
|
// found, set the type for both brackets in the pair to match the embedding |
|
// direction. |
|
// |
|
// o [ e ] o -> o e e e o |
|
// |
|
// o [ o e ] -> o e o e e |
|
// |
|
// o [ NI e ] -> o e NI e e |
|
// |
|
// c Otherwise, if a strong type (opposite the embedding direction) is |
|
// found, test for adjacent strong types as follows: 1 First, check |
|
// backwards before the opening paired bracket until the first strong type |
|
// (L, R, or sos) is found. If that first preceding strong type is opposite |
|
// the embedding direction, then set the type for both brackets in the pair |
|
// to that type. 2 Otherwise, set the type for both brackets in the pair to |
|
// the embedding direction. |
|
// |
|
// o [ o ] e -> o o o o e |
|
// |
|
// o [ o NI ] o -> o o o NI o o |
|
// |
|
// e [ o ] o -> e e o e o |
|
// |
|
// e [ o ] e -> e e o e e |
|
// |
|
// e ( o [ o ] NI ) e -> e e o o o o NI e e |
|
// |
|
// d Otherwise, do not set the type for the current bracket pair. Note that |
|
// if the enclosed text contains no strong types the paired brackets will |
|
// both resolve to the same level when resolved individually using rules N1 |
|
// and N2. |
|
// |
|
// e ( NI ) o -> e ( NI ) o |
|
|
|
// getStrongTypeN0 maps character's directional code to strong type as required |
|
// by rule N0. |
|
// |
|
// TODO: have separate type for "strong" directionality. |
|
func (p *bracketPairer) getStrongTypeN0(index int) Class { |
|
switch p.codesIsolatedRun[index] { |
|
// in the scope of N0, number types are treated as R |
|
case EN, AN, AL, R: |
|
return R |
|
case L: |
|
return L |
|
default: |
|
return ON |
|
} |
|
} |
|
|
|
// classifyPairContent reports the strong types contained inside a Bracket Pair, |
|
// assuming the given embedding direction. |
|
// |
|
// It returns ON if no strong type is found. If a single strong type is found, |
|
// it returns this type. Otherwise it returns the embedding direction. |
|
// |
|
// TODO: use separate type for "strong" directionality. |
|
func (p *bracketPairer) classifyPairContent(loc bracketPair, dirEmbed Class) Class { |
|
dirOpposite := ON |
|
for i := loc.opener + 1; i < loc.closer; i++ { |
|
dir := p.getStrongTypeN0(i) |
|
if dir == ON { |
|
continue |
|
} |
|
if dir == dirEmbed { |
|
return dir // type matching embedding direction found |
|
} |
|
dirOpposite = dir |
|
} |
|
// return ON if no strong type found, or class opposite to dirEmbed |
|
return dirOpposite |
|
} |
|
|
|
// classBeforePair determines which strong types are present before a Bracket |
|
// Pair. Return R or L if strong type found, otherwise ON. |
|
func (p *bracketPairer) classBeforePair(loc bracketPair) Class { |
|
for i := loc.opener - 1; i >= 0; i-- { |
|
if dir := p.getStrongTypeN0(i); dir != ON { |
|
return dir |
|
} |
|
} |
|
// no strong types found, return sos |
|
return p.sos |
|
} |
|
|
|
// assignBracketType implements rule N0 for a single bracket pair. |
|
func (p *bracketPairer) assignBracketType(loc bracketPair, dirEmbed Class, initialTypes []Class) { |
|
// rule "N0, a", inspect contents of pair |
|
dirPair := p.classifyPairContent(loc, dirEmbed) |
|
|
|
// dirPair is now L, R, or N (no strong type found) |
|
|
|
// the following logical tests are performed out of order compared to |
|
// the statement of the rules but yield the same results |
|
if dirPair == ON { |
|
return // case "d" - nothing to do |
|
} |
|
|
|
if dirPair != dirEmbed { |
|
// case "c": strong type found, opposite - check before (c.1) |
|
dirPair = p.classBeforePair(loc) |
|
if dirPair == dirEmbed || dirPair == ON { |
|
// no strong opposite type found before - use embedding (c.2) |
|
dirPair = dirEmbed |
|
} |
|
} |
|
// else: case "b", strong type found matching embedding, |
|
// no explicit action needed, as dirPair is already set to embedding |
|
// direction |
|
|
|
// set the bracket types to the type found |
|
p.setBracketsToType(loc, dirPair, initialTypes) |
|
} |
|
|
|
func (p *bracketPairer) setBracketsToType(loc bracketPair, dirPair Class, initialTypes []Class) { |
|
p.codesIsolatedRun[loc.opener] = dirPair |
|
p.codesIsolatedRun[loc.closer] = dirPair |
|
|
|
for i := loc.opener + 1; i < loc.closer; i++ { |
|
index := p.indexes[i] |
|
if initialTypes[index] != NSM { |
|
break |
|
} |
|
p.codesIsolatedRun[i] = dirPair |
|
} |
|
|
|
for i := loc.closer + 1; i < len(p.indexes); i++ { |
|
index := p.indexes[i] |
|
if initialTypes[index] != NSM { |
|
break |
|
} |
|
p.codesIsolatedRun[i] = dirPair |
|
} |
|
} |
|
|
|
// resolveBrackets implements rule N0 for a list of pairs. |
|
func (p *bracketPairer) resolveBrackets(dirEmbed Class, initialTypes []Class) { |
|
for _, loc := range p.pairPositions { |
|
p.assignBracketType(loc, dirEmbed, initialTypes) |
|
} |
|
}
|
|
|