You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 

876 lines
21 KiB

package scanner
import (
"io"
"strings"
"github.com/goccy/go-yaml/token"
"golang.org/x/xerrors"
)
// IndentState describes how the indentation of the line being scanned
// relates to the indentation recorded for the previous line.
type IndentState int
const (
// IndentStateEqual means the indent is equal to the previous indent.
IndentStateEqual IndentState = iota
// IndentStateUp means the indent is deeper than the previous indent.
IndentStateUp
// IndentStateDown means the indent is shallower than the previous indent.
IndentStateDown
// IndentStateKeep means no indent comparison applies: the scanner sets it
// for characters that are not the first one on their line and right after
// a literal/folded block header.
IndentStateKeep
)
// Scanner holds the scanner's internal state while processing a given text.
// It can be allocated as part of another data structure but must be initialized via Init before use.
type Scanner struct {
// source is the input text converted to runes by Init.
source []rune
// sourcePos is the index into source where the next Scan call starts.
sourcePos int
// sourceSize is len(source).
sourceSize int
// line, column and offset track the current position; Init starts all three at 1.
line int
column int
offset int
// prevIndentLevel and prevIndentNum are the indent level and indent width
// recorded by updateIndent for the previous line.
prevIndentLevel int
prevIndentNum int
// prevIndentColumn is the column of the last mapping key, merge key or
// sequence entry token; 0 means "not set".
prevIndentColumn int
// docStartColumn is the column of the first non-space character seen inside
// a literal/folded block; breakLiteral resets it to 0.
docStartColumn int
// indentLevel is the nesting level derived from indent width changes.
indentLevel int
// indentNum counts the leading spaces of the current line.
indentNum int
// isFirstCharAtLine is true until the first non-space character of the
// current line has been processed.
isFirstCharAtLine bool
// isAnchor is set after an anchor ('&') token and cleared on a line break
// or when the following space ends the anchor name.
isAnchor bool
// startedFlowSequenceNum and startedFlowMapNum count currently open '[' and '{'.
startedFlowSequenceNum int
startedFlowMapNum int
// indentState is the result of the latest indent comparison.
indentState IndentState
// savedPos remembers where the buffered text started when a line break is
// scanned; bufferedToken consumes and clears it.
savedPos *token.Position
}
// pos returns a freshly allocated token.Position describing the scanner's
// current line, column, offset and indentation.
func (s *Scanner) pos() *token.Position {
	current := new(token.Position)
	current.Line = s.line
	current.Column = s.column
	current.Offset = s.offset
	current.IndentNum = s.indentNum
	current.IndentLevel = s.indentLevel
	return current
}
// bufferedToken asks ctx to build a token from its buffered text.
// A position saved by scanNewLine takes precedence (and is cleared);
// otherwise the position is the current one moved back by the buffer length.
func (s *Scanner) bufferedToken(ctx *Context) *token.Token {
	if saved := s.savedPos; saved != nil {
		s.savedPos = nil
		return ctx.bufferedToken(saved)
	}
	buffered := len(ctx.buf)
	start := s.pos()
	start.Column -= buffered
	start.Offset -= buffered
	return ctx.bufferedToken(start)
}
// progressColumn advances the context by num and moves the scanner's column
// and offset forward by the same amount.
func (s *Scanner) progressColumn(ctx *Context, num int) {
	ctx.progress(num)
	s.offset += num
	s.column += num
}
// progressLine moves the scanner to the first column of the next line,
// resets the per-line state (indent width, first-character flag, anchor flag)
// and advances the context by one character.
func (s *Scanner) progressLine(ctx *Context) {
	ctx.progress(1)
	s.line++
	s.offset++
	s.column = 1
	s.indentNum = 0
	s.isAnchor = false
	s.isFirstCharAtLine = true
}
// isNeededKeepPreviousIndentNum reports whether the previously recorded indent
// must be kept: the indent went up and either the context is a document
// (literal/folded block) or c is '-' while text is already buffered.
func (s *Scanner) isNeededKeepPreviousIndentNum(ctx *Context, c rune) bool {
	return s.isChangedToIndentStateUp() &&
		(ctx.isDocument() || (c == '-' && ctx.existsBuffer()))
}
// isNewLineChar reports whether c is a line feed or a carriage return.
func (s *Scanner) isNewLineChar(c rune) bool {
	switch c {
	case '\n', '\r':
		return true
	default:
		return false
	}
}
// newLineCount returns the number of line breaks in src.
// "\r\n" counts as a single break; a lone '\r' or '\n' counts as one each.
func (s *Scanner) newLineCount(src []rune) int {
	total := 0
	for i, n := 0, len(src); i < n; i++ {
		ch := src[i]
		if ch != '\r' && ch != '\n' {
			continue
		}
		total++
		// Swallow the LF of a CRLF pair so it is not counted twice.
		if ch == '\r' && i+1 < n && src[i+1] == '\n' {
			i++
		}
	}
	return total
}
// updateIndentState recomputes indentLevel and indentState for the first
// non-space character of a line.
//
// The indent width (indentNum) is compared with the previous line first.
// When a reference column is known (prevIndentColumn > 0) the column decides
// instead, except that equal columns only count as "equal" if the indent
// width comparison agrees. Example, with the cursor on 'd':
//
//	a:
//	  b
//	    c
//	d: e
//
// Here prevIndentColumn is the column of 'a' and equals the column of 'd',
// but the indent width went down compared with 'c', so the result is
// IndentStateDown.
func (s *Scanner) updateIndentState(ctx *Context) {
	var byIndentNum IndentState
	switch {
	case s.indentNum > s.prevIndentNum:
		byIndentNum = IndentStateUp
		s.indentLevel = s.prevIndentLevel + 1
	case s.indentNum == s.prevIndentNum:
		byIndentNum = IndentStateEqual
		s.indentLevel = s.prevIndentLevel
	default:
		byIndentNum = IndentStateDown
		if s.prevIndentLevel > 0 {
			s.indentLevel = s.prevIndentLevel - 1
		}
	}

	if s.prevIndentColumn <= 0 {
		// No reference column: the indent width comparison is final.
		s.indentState = byIndentNum
		return
	}

	switch {
	case s.column > s.prevIndentColumn:
		s.indentState = IndentStateUp
	case s.column == s.prevIndentColumn && byIndentNum == IndentStateEqual:
		s.indentState = IndentStateEqual
	default:
		s.indentState = IndentStateDown
	}
}
// updateIndent maintains the indentation bookkeeping for the character c.
//
// Characters after the first non-space one on a line only mark the state as
// IndentStateKeep. Leading spaces are counted into indentNum. The first
// non-space character triggers updateIndentState and, unless the previous
// indent has to be kept, records the current indent as the new "previous".
func (s *Scanner) updateIndent(ctx *Context, c rune) {
	if !s.isFirstCharAtLine {
		s.indentState = IndentStateKeep
		return
	}
	// A line break at the start of a line inside a document (block scalar)
	// leaves the indent untouched.
	if s.isNewLineChar(c) && ctx.isDocument() {
		return
	}
	if c == ' ' {
		s.indentNum++
		return
	}

	s.updateIndentState(ctx)
	s.isFirstCharAtLine = false
	if s.isNeededKeepPreviousIndentNum(ctx, c) {
		return
	}
	if !s.isChangedToIndentStateUp() {
		s.prevIndentColumn = 0
	}
	s.prevIndentLevel = s.indentLevel
	s.prevIndentNum = s.indentNum
}
// isChangedToIndentStateDown reports whether the latest indent comparison
// resulted in IndentStateDown.
func (s *Scanner) isChangedToIndentStateDown() bool {
	switch s.indentState {
	case IndentStateDown:
		return true
	default:
		return false
	}
}
// isChangedToIndentStateUp reports whether the latest indent comparison
// resulted in IndentStateUp.
func (s *Scanner) isChangedToIndentStateUp() bool {
	switch s.indentState {
	case IndentStateUp:
		return true
	default:
		return false
	}
}
// isChangedToIndentStateEqual reports whether the latest indent comparison
// resulted in IndentStateEqual.
func (s *Scanner) isChangedToIndentStateEqual() bool {
	switch s.indentState {
	case IndentStateEqual:
		return true
	default:
		return false
	}
}
// addBufferedTokenIfExists builds a token from the buffered text and hands
// it to ctx.addToken.
func (s *Scanner) addBufferedTokenIfExists(ctx *Context) {
	buffered := s.bufferedToken(ctx)
	ctx.addToken(buffered)
}
// breakLiteral ends the current literal/folded block: the context is told to
// break the literal and the remembered start column is cleared.
func (s *Scanner) breakLiteral(ctx *Context) {
	ctx.breakLiteral()
	s.docStartColumn = 0
}
// scanSingleQuote scans a single-quoted scalar whose opening quote is the
// current character of ctx.
//
// Line breaks inside the scalar become a single space, spaces that start a
// continuation line are dropped, and '' is read as one quote character.
// On success tk is the SingleQuote token and pos the number of characters
// after the opening quote up to and including the closing one. If the input
// ends before the closing quote, tk is nil.
func (s *Scanner) scanSingleQuote(ctx *Context) (tk *token.Token, pos int) {
	const quote = '\''
	ctx.addOriginBuf(quote)
	start := s.pos()
	first := ctx.idx + 1
	runes := ctx.src
	total := len(runes)
	var content []rune
	atLineHead := false
	afterBreak := false
	for i := first; i < total; i++ {
		// progressLine already accounted for the character that follows a
		// line break, so the column is only advanced otherwise.
		if afterBreak {
			afterBreak = false
		} else {
			s.progressColumn(ctx, 1)
		}
		ch := runes[i]
		pos = i + 1
		ctx.addOriginBuf(ch)
		switch {
		case s.isNewLineChar(ch):
			content = append(content, ' ')
			atLineHead = true
			afterBreak = true
			s.progressLine(ctx)
			continue
		case ch == ' ' && atLineHead:
			continue
		case ch != quote:
			content = append(content, ch)
			atLineHead = false
			continue
		}
		if i+1 < total && runes[i+1] == quote {
			// '' is an escaped quote character.
			content = append(content, ch)
			ctx.addOriginBuf(ch)
			i++
			continue
		}
		s.progressColumn(ctx, 1)
		tk = token.SingleQuote(string(content), string(ctx.obuf), start)
		pos = i - first + 1
		return
	}
	return
}
// hexToInt converts one hexadecimal digit to its numeric value.
// Letters are accepted in either case; any other rune is treated as a
// decimal digit, i.e. its distance from '0' is returned without validation.
func hexToInt(b rune) int {
	switch {
	case 'A' <= b && b <= 'F':
		return int(b) - 'A' + 10
	case 'a' <= b && b <= 'f':
		return int(b) - 'a' + 10
	default:
		return int(b) - '0'
	}
}
// hexRunesToInt interprets b as a big-endian sequence of hexadecimal digits
// and returns the resulting number. An empty slice yields 0.
func hexRunesToInt(b []rune) int {
	sum := 0
	for _, digit := range b {
		// Shift the digits seen so far one hex place to the left.
		sum = sum<<4 + hexToInt(digit)
	}
	return sum
}
// scanDoubleQuote scans a double-quoted scalar whose opening quote is the
// current character of ctx.
//
// Line breaks inside the scalar become a single space and spaces that start
// a continuation line are dropped. The YAML escape sequences are decoded:
//
//	\0 \a \b \t \n \v \f \r \e \" \/ \\ \<space> \<tab>
//	\N (U+0085) \_ (U+00A0) \L (U+2028) \P (U+2029)
//	\xXX \uXXXX \UXXXXXXXX
//
// A backslash followed by any other character is kept literally.
// On success tk is the DoubleQuote token and pos the number of characters
// after the opening quote up to and including the closing one. tk is nil if
// the input ends before the closing quote or inside a numeric escape.
func (s *Scanner) scanDoubleQuote(ctx *Context) (tk *token.Token, pos int) {
	ctx.addOriginBuf('"')
	srcpos := s.pos()
	startIndex := ctx.idx + 1
	src := ctx.src
	size := len(src)
	value := []rune{}
	isFirstLineChar := false
	isNewLine := false
	for idx := startIndex; idx < size; idx++ {
		// progressLine already accounted for the character that follows a
		// line break, so the column is only advanced otherwise.
		if !isNewLine {
			s.progressColumn(ctx, 1)
		} else {
			isNewLine = false
		}
		c := src[idx]
		pos = idx + 1
		ctx.addOriginBuf(c)
		if s.isNewLineChar(c) {
			value = append(value, ' ')
			isFirstLineChar = true
			isNewLine = true
			s.progressLine(ctx)
			continue
		} else if c == ' ' && isFirstLineChar {
			continue
		} else if c == '\\' {
			isFirstLineChar = false
			if idx+1 < size {
				nextChar := src[idx+1]

				// Escapes that consist of exactly one character after the
				// backslash. value holds runes, so the replacement must be
				// the code point itself, not its UTF-8 bytes.
				escaped, isSimple := rune(0), true
				switch nextChar {
				case '0':
					escaped = 0
				case 'a':
					escaped = '\a'
				case 'b':
					escaped = '\b'
				case 't':
					escaped = '\t'
				case 'n':
					escaped = '\n'
				case 'v':
					escaped = '\v'
				case 'f':
					escaped = '\f'
				case 'r':
					escaped = '\r'
				case 'e':
					escaped = '\x1B'
				case ' ', '\t', '"', '/', '\\':
					escaped = nextChar
				case 'N': // NEL
					escaped = '\u0085'
				case '_': // non-breaking space
					escaped = '\u00A0'
				case 'L': // LS
					escaped = '\u2028'
				case 'P': // PS
					escaped = '\u2029'
				default:
					isSimple = false
				}
				if isSimple {
					ctx.addOriginBuf(nextChar)
					value = append(value, escaped)
					idx++
					continue
				}

				// Numeric escapes: a marker followed by a fixed number of
				// hexadecimal digits.
				hexLen := 0
				switch nextChar {
				case 'x':
					hexLen = 2
				case 'u':
					hexLen = 4
				case 'U':
					hexLen = 8
				}
				if hexLen > 0 {
					if idx+1+hexLen >= size {
						// TODO: need to return error
						//err = xerrors.New("invalid escape character")
						return
					}
					// Keep the marker and the digits in the origin text so
					// that it still mirrors the source.
					for _, r := range src[idx+1 : idx+2+hexLen] {
						ctx.addOriginBuf(r)
					}
					// NOTE(review): the digits are not validated; hexToInt
					// accepts any rune.
					codeNum := hexRunesToInt(src[idx+2 : idx+2+hexLen])
					value = append(value, rune(codeNum))
					idx += hexLen + 1
					continue
				}
			}
			// Unknown escape or trailing backslash: keep the backslash.
			value = append(value, c)
			continue
		} else if c != '"' {
			value = append(value, c)
			isFirstLineChar = false
			continue
		}
		s.progressColumn(ctx, 1)
		tk = token.DoubleQuote(string(value), string(ctx.obuf), srcpos)
		pos = idx - startIndex + 1
		return
	}
	return
}
// scanQuote scans a quoted scalar: single-quoted when ch is '\'',
// double-quoted for any other ch.
func (s *Scanner) scanQuote(ctx *Context, ch rune) (tk *token.Token, pos int) {
	switch ch {
	case '\'':
		return s.scanSingleQuote(ctx)
	default:
		return s.scanDoubleQuote(ctx)
	}
}
// isMergeKey reports whether the scanner is looking at a merge key, i.e.
// exactly two '<' characters followed, after optional spaces, by a ':' that
// is itself followed by a space or a line break.
func (s *Scanner) isMergeKey(ctx *Context) bool {
	if ctx.repeatNum('<') != 2 {
		return false
	}
	runes := ctx.src
	last := len(runes) - 1
	for i := ctx.idx + 2; i <= last; i++ {
		ch := runes[i]
		switch {
		case ch == ' ':
			continue
		case ch != ':':
			return false
		}
		// ch is ':' here; it only terminates a merge key when white space
		// or a line break follows.
		if i < last {
			if next := runes[i+1]; next == ' ' || s.isNewLineChar(next) {
				return true
			}
		}
	}
	return false
}
// scanTag scans a tag that starts at the current '!' character.
// The tag ends at the first space or line break; tk is then the Tag token
// and pos the number of runes of the tag text. If no terminator is found
// before the end of the input, tk stays nil and pos is the number of
// characters examined.
func (s *Scanner) scanTag(ctx *Context) (tk *token.Token, pos int) {
	ctx.addOriginBuf('!')
	ctx.progress(1) // skip '!' character
	rest := ctx.src[ctx.idx:]
	for i := 0; i < len(rest); i++ {
		ch := rest[i]
		pos = i + 1
		ctx.addOriginBuf(ch)
		if ch != ' ' && ch != '\n' && ch != '\r' {
			continue
		}
		text := ctx.source(ctx.idx-1, ctx.idx+i)
		tk = token.Tag(text, string(ctx.obuf), s.pos())
		pos = len([]rune(text))
		return
	}
	return
}
// scanComment scans a comment that starts at the current '#' character.
// The comment ends at the first line break, unless ctx.previousChar()
// reports a backslash; tk is then the Comment token and pos the number of
// runes of the comment text plus one. If no terminating line break is found,
// tk stays nil and pos is the number of characters examined.
func (s *Scanner) scanComment(ctx *Context) (tk *token.Token, pos int) {
	ctx.addOriginBuf('#')
	ctx.progress(1) // skip '#' character
	rest := ctx.src[ctx.idx:]
	for i := 0; i < len(rest); i++ {
		ch := rest[i]
		pos = i + 1
		ctx.addOriginBuf(ch)
		if ch != '\n' && ch != '\r' {
			continue
		}
		if ctx.previousChar() == '\\' {
			continue
		}
		text := ctx.source(ctx.idx, ctx.idx+i)
		tk = token.Comment(text, string(ctx.obuf), s.pos())
		pos = len([]rune(text)) + 1
		return
	}
	return
}
// trimCommentFromLiteralOpt strips a trailing "# comment" from the option
// part of a block scalar header (the text after '|' or '>').
//
// It returns text unchanged when it contains no '#'. Otherwise it returns
// everything before the comment with all separating spaces and tabs removed,
// so "-  # note" yields "-". A '#' that is not preceded by a space or tab
// (including one at the very start of text) is not a valid comment and
// results in an "invalid literal header" error.
func trimCommentFromLiteralOpt(text string) (string, error) {
	idx := strings.Index(text, "#")
	if idx < 0 {
		return text, nil
	}
	if idx == 0 {
		return "", xerrors.New("invalid literal header")
	}
	// YAML requires white space between a token and a following comment.
	if prev := text[idx-1]; prev != ' ' && prev != '\t' {
		return "", xerrors.New("invalid literal header")
	}
	return strings.TrimRight(text[:idx], " \t"), nil
}
// scanLiteral consumes one character c of a literal/folded block body.
//
//   - At the end of the source the buffered text is emitted as a String token.
//   - A line break is buffered as-is for literal blocks and as a space
//     otherwise, then the scanner moves to the next line.
//   - Leading spaces of a line are buffered only once the block's start
//     column is known and has been reached.
//   - Any other character is buffered; the first one fixes the start column.
func (s *Scanner) scanLiteral(ctx *Context, c rune) {
	ctx.addOriginBuf(c)
	switch {
	case ctx.isEOS():
		if ctx.isLiteral {
			ctx.addBuf(c)
		}
		text := ctx.bufferedSrc()
		ctx.addToken(token.String(string(text), string(ctx.obuf), s.pos()))
		ctx.resetBuffer()
		s.progressColumn(ctx, 1)
	case s.isNewLineChar(c):
		buffered := ' '
		if ctx.isLiteral {
			buffered = c
		}
		ctx.addBuf(buffered)
		s.progressLine(ctx)
	case s.isFirstCharAtLine && c == ' ':
		if s.docStartColumn > 0 && s.column >= s.docStartColumn {
			ctx.addBuf(c)
		}
		s.progressColumn(ctx, 1)
	default:
		if s.docStartColumn == 0 {
			s.docStartColumn = s.column
		}
		ctx.addBuf(c)
		s.progressColumn(ctx, 1)
	}
}
// scanLiteralHeader scans a block scalar header that starts at the current
// '|' (literal) or '>' (folded) character and runs to the end of the line.
//
// The text after the indicator may be empty, "+", "-" or a single digit,
// optionally followed by a comment. A Literal or Folded token is added to
// ctx (plus a Comment token when a comment is present), the matching
// ctx.isLiteral / ctx.isFolded flag is set and the option is stored in
// ctx.literalOpt.
//
// pos is the index of the terminating line break relative to the character
// after the indicator. An "invalid literal header" error is returned when the
// option is not one of the accepted values or the line never ends.
func (s *Scanner) scanLiteralHeader(ctx *Context) (pos int, err error) {
	header := ctx.currentChar()
	ctx.addOriginBuf(header)
	ctx.progress(1) // skip '|' or '>' character
	for idx, c := range ctx.src[ctx.idx:] {
		pos = idx
		ctx.addOriginBuf(c)
		if c != '\n' && c != '\r' {
			continue
		}
		value := ctx.source(ctx.idx, ctx.idx+idx)
		opt := strings.TrimRight(value, " ")
		// ctx.obuf holds runes, so every length used to slice it must be
		// counted in runes; byte lengths overshoot for non-ASCII comments.
		orgOptLen := len([]rune(opt))
		opt, err = trimCommentFromLiteralOpt(opt)
		if err != nil {
			return
		}
		switch opt {
		case "", "+", "-",
			"0", "1", "2", "3", "4", "5", "6", "7", "8", "9":
		default:
			// The header line is complete and its option is not valid;
			// scanning further lines cannot change that.
			err = xerrors.New("invalid literal header")
			return
		}
		if header == '|' || header == '>' {
			headerText := string(header)
			// addHeaderToken emits the Literal or Folded token with the
			// given origin text at the current position.
			addHeaderToken := func(org string) {
				if header == '|' {
					ctx.addToken(token.Literal(headerText+opt, org, s.pos()))
				} else {
					ctx.addToken(token.Folded(headerText+opt, org, s.pos()))
				}
			}
			commentLen := orgOptLen - len([]rune(opt))
			if commentLen > 0 {
				// Split the origin text into the header part and the
				// comment part and emit one token for each.
				headerPos := strings.Index(string(ctx.obuf), headerText)
				headerBuf := ctx.obuf[:len(ctx.obuf)-commentLen-headerPos]
				commentBuf := ctx.obuf[len(headerBuf):]
				addHeaderToken(string(headerBuf))
				s.column += len(headerBuf)
				s.offset += len(headerBuf)
				commentHeader := strings.Index(value, "#")
				ctx.addToken(token.Comment(value[commentHeader+1:], string(commentBuf), s.pos()))
			} else {
				addHeaderToken(string(ctx.obuf))
			}
			if header == '|' {
				ctx.isLiteral = true
			} else {
				ctx.isFolded = true
			}
		}
		s.indentState = IndentStateKeep
		ctx.resetBuffer()
		ctx.literalOpt = opt
		return
	}
	err = xerrors.New("invalid literal header")
	return
}
// scanNewLine handles a line break c outside of block scalars.
//
// The start position of pending buffered text is remembered, trailing spaces
// are removed from the buffer, and the buffered token is emitted when the
// source ends here or an anchor is pending. The break itself is buffered as
// a space before the scanner moves on to the next line.
func (s *Scanner) scanNewLine(ctx *Context, c rune) {
	if s.savedPos == nil && len(ctx.buf) > 0 {
		start := s.pos()
		start.Column -= len(ctx.bufferedSrc())
		s.savedPos = start
	}

	// In the following case the buffer ends with two unnecessary spaces.
	// removeRightSpaceFromBuf drops them, and the positions are moved back
	// by the same amount.
	// ---
	// a:[space][space]
	//   b: c
	if trimmed := ctx.removeRightSpaceFromBuf(); trimmed > 0 {
		s.column -= trimmed
		s.offset -= trimmed
		if s.savedPos != nil {
			s.savedPos.Column -= trimmed
		}
	}

	if ctx.isEOS() || s.isAnchor {
		s.addBufferedTokenIfExists(ctx)
	}
	ctx.addBuf(' ')
	ctx.addOriginBuf(c)
	ctx.isSingleLine = false
	s.progressLine(ctx)
}
// scan runs the tokenizer over ctx until a token boundary is reached or the
// context is exhausted, adding the tokens it finds to ctx.
// The returned pos is what Scan adds to the scanner's sourcePos; it starts
// from ctx.nextPos() of the last character looked at and is adjusted by the
// branches that consume (or give back) additional characters.
// Characters that do not start a token are appended to the context buffer and
// become a token later via bufferedToken.
func (s *Scanner) scan(ctx *Context) (pos int) {
for ctx.next() {
pos = ctx.nextPos()
c := ctx.currentChar()
s.updateIndent(ctx, c)
// Inside a literal/folded block every character belongs to the block until
// the indent returns to, or drops below, the previous one.
if ctx.isDocument() {
if s.isChangedToIndentStateEqual() ||
s.isChangedToIndentStateDown() {
s.addBufferedTokenIfExists(ctx)
s.breakLiteral(ctx)
} else {
s.scanLiteral(ctx, c)
continue
}
} else if s.isChangedToIndentStateDown() {
s.addBufferedTokenIfExists(ctx)
} else if s.isChangedToIndentStateEqual() {
// if first character is new line character, buffer expect to raw folded literal
if len(ctx.obuf) > 0 && s.newLineCount(ctx.obuf) <= 1 {
// doesn't raw folded literal
s.addBufferedTokenIfExists(ctx)
}
}
// Dispatch on the current character. A case that does not return or continue
// falls out of the switch and the character is buffered as plain text.
switch c {
case '{':
if !ctx.existsBuffer() {
ctx.addOriginBuf(c)
ctx.addToken(token.MappingStart(string(ctx.obuf), s.pos()))
s.startedFlowMapNum++
s.progressColumn(ctx, 1)
return
}
case '}':
if !ctx.existsBuffer() || s.startedFlowMapNum > 0 {
ctx.addToken(s.bufferedToken(ctx))
ctx.addOriginBuf(c)
ctx.addToken(token.MappingEnd(string(ctx.obuf), s.pos()))
s.startedFlowMapNum--
s.progressColumn(ctx, 1)
return
}
case '.':
// "..." at the very start of a line is a document end marker.
if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('.') == 3 {
ctx.addToken(token.DocumentEnd(string(ctx.obuf)+"...", s.pos()))
s.progressColumn(ctx, 3)
pos += 2
return
}
case '<':
if s.isMergeKey(ctx) {
s.prevIndentColumn = s.column
ctx.addToken(token.MergeKey(string(ctx.obuf)+"<<", s.pos()))
s.progressColumn(ctx, 1)
pos++
return
}
case '-':
// "---" at the very start of a line is a document header.
if s.indentNum == 0 && s.column == 1 && ctx.repeatNum('-') == 3 {
s.addBufferedTokenIfExists(ctx)
ctx.addToken(token.DocumentHeader(string(ctx.obuf)+"---", s.pos()))
s.progressColumn(ctx, 3)
pos += 2
return
}
if ctx.existsBuffer() && s.isChangedToIndentStateUp() {
// raw folded
ctx.isRawFolded = true
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if ctx.existsBuffer() {
// '-' is literal
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
// '-' followed by a space or a line break starts a sequence entry.
nc := ctx.nextChar()
if nc == ' ' || s.isNewLineChar(nc) {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
tk := token.SequenceEntry(string(ctx.obuf), s.pos())
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
s.progressColumn(ctx, 1)
return
}
case '[':
if !ctx.existsBuffer() {
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceStart(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum++
s.progressColumn(ctx, 1)
return
}
case ']':
if !ctx.existsBuffer() || s.startedFlowSequenceNum > 0 {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.SequenceEnd(string(ctx.obuf), s.pos()))
s.startedFlowSequenceNum--
s.progressColumn(ctx, 1)
return
}
case ',':
// ',' only separates entries inside a flow sequence or flow mapping.
if s.startedFlowSequenceNum > 0 || s.startedFlowMapNum > 0 {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.CollectEntry(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
case ':':
nc := ctx.nextChar()
if s.startedFlowMapNum > 0 || nc == ' ' || s.isNewLineChar(nc) || ctx.isNextEOS() {
// mapping value
tk := s.bufferedToken(ctx)
if tk != nil {
// The key's column becomes the reference column for later indent checks.
s.prevIndentColumn = tk.Position.Column
ctx.addToken(tk)
}
ctx.addToken(token.MappingValue(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '|', '>':
if !ctx.existsBuffer() {
progress, err := s.scanLiteralHeader(ctx)
if err != nil {
// TODO: returns syntax error object
return
}
s.progressColumn(ctx, progress)
s.progressLine(ctx)
continue
}
case '!':
if !ctx.existsBuffer() {
token, progress := s.scanTag(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
if c := ctx.previousChar(); s.isNewLineChar(c) {
s.progressLine(ctx)
}
pos += progress
return
}
case '%':
if !ctx.existsBuffer() && s.indentNum == 0 {
ctx.addToken(token.Directive(string(ctx.obuf)+"%", s.pos()))
s.progressColumn(ctx, 1)
return
}
case '?':
nc := ctx.nextChar()
if !ctx.existsBuffer() && nc == ' ' {
ctx.addToken(token.MappingKey(s.pos()))
s.progressColumn(ctx, 1)
return
}
case '&':
if !ctx.existsBuffer() {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Anchor(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
s.isAnchor = true
return
}
case '*':
if !ctx.existsBuffer() {
s.addBufferedTokenIfExists(ctx)
ctx.addOriginBuf(c)
ctx.addToken(token.Alias(string(ctx.obuf), s.pos()))
s.progressColumn(ctx, 1)
return
}
case '#':
// '#' starts a comment when nothing is buffered or the previous character is a space.
if !ctx.existsBuffer() || ctx.previousChar() == ' ' {
s.addBufferedTokenIfExists(ctx)
token, progress := s.scanComment(ctx)
ctx.addToken(token)
s.progressColumn(ctx, progress)
s.progressLine(ctx)
pos += progress
return
}
case '\'', '"':
if !ctx.existsBuffer() {
token, progress := s.scanQuote(ctx, c)
ctx.addToken(token)
pos += progress
return
}
case '\r', '\n':
// There is no problem that we ignore CR which followed by LF and normalize it to LF, because of following YAML1.2 spec.
// > Line breaks inside scalar content must be normalized by the YAML processor. Each such line break must be parsed into a single line feed character.
// > Outside scalar content, YAML allows any line break to be used to terminate lines.
// > -- https://yaml.org/spec/1.2/spec.html
if c == '\r' && ctx.nextChar() == '\n' {
ctx.addOriginBuf('\r')
ctx.progress(1)
c = '\n'
}
s.scanNewLine(ctx, c)
continue
case ' ':
if ctx.isSaveIndentMode() || (!s.isAnchor && !s.isFirstCharAtLine) {
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
continue
}
if s.isFirstCharAtLine {
s.progressColumn(ctx, 1)
ctx.addOriginBuf(c)
continue
}
// Remaining case: a space that ends an anchor name.
s.addBufferedTokenIfExists(ctx)
pos-- // to rescan white space at next scanning for adding white space to next buffer.
s.isAnchor = false
return
}
// Plain character: collect it into the buffer.
ctx.addBuf(c)
ctx.addOriginBuf(c)
s.progressColumn(ctx, 1)
}
// The context is exhausted: emit whatever is still buffered.
s.addBufferedTokenIfExists(ctx)
return
}
// Init prepares the scanner s to tokenize the text src by setting the scanner at the beginning of src.
//
// All scanning state is reset, so a Scanner that has already been used can
// be re-initialized without carrying over flow-collection counters, anchor
// state, literal-block state or a saved position from the previous text.
// Line, column and offset start at 1.
func (s *Scanner) Init(text string) {
	src := []rune(text)
	// Assigning a fresh value zeroes every field that is not listed here.
	*s = Scanner{
		source:            src,
		sourceSize:        len(src),
		line:              1,
		column:            1,
		offset:            1,
		isFirstCharAtLine: true,
	}
}
// Scan scans the next token and returns the token collection. The source end is indicated by io.EOF.
//
// Each call scans the not-yet-consumed rest of the source in a fresh context,
// advances the source position by what the scan consumed and returns a copy
// of the context's tokens, because the context is released on return.
func (s *Scanner) Scan() (token.Tokens, error) {
	if s.sourceSize <= s.sourcePos {
		return nil, io.EOF
	}
	remaining := s.source[s.sourcePos:]
	ctx := newContext(remaining)
	defer ctx.release()
	consumed := s.scan(ctx)
	s.sourcePos += consumed
	var scanned token.Tokens
	return append(scanned, ctx.tokens...), nil
}