mirror of
https://github.com/superseriousbusiness/gotosocial.git
synced 2024-12-23 10:42:12 +00:00
1032 lines
19 KiB
Go
1032 lines
19 KiB
Go
|
// TOML lexer.
|
||
|
//
|
||
|
// Written using the principles developed by Rob Pike in
|
||
|
// http://www.youtube.com/watch?v=HxaD_trXwRE
|
||
|
|
||
|
package toml
|
||
|
|
||
|
import (
|
||
|
"bytes"
|
||
|
"errors"
|
||
|
"fmt"
|
||
|
"strconv"
|
||
|
"strings"
|
||
|
)
|
||
|
|
||
|
// Define state functions
|
||
|
// tomlLexStateFn is one state of the lexer's state machine: a function that
// does some lexing and returns the state to run next. The run loop stops as
// soon as a state returns nil.
type tomlLexStateFn func() tomlLexStateFn
|
||
|
|
||
|
// Define lexer
|
||
|
type tomlLexer struct {
|
||
|
inputIdx int
|
||
|
input []rune // Textual source
|
||
|
currentTokenStart int
|
||
|
currentTokenStop int
|
||
|
tokens []token
|
||
|
brackets []rune
|
||
|
line int
|
||
|
col int
|
||
|
endbufferLine int
|
||
|
endbufferCol int
|
||
|
}
|
||
|
|
||
|
// Basic read operations on input
|
||
|
|
||
|
func (l *tomlLexer) read() rune {
|
||
|
r := l.peek()
|
||
|
if r == '\n' {
|
||
|
l.endbufferLine++
|
||
|
l.endbufferCol = 1
|
||
|
} else {
|
||
|
l.endbufferCol++
|
||
|
}
|
||
|
l.inputIdx++
|
||
|
return r
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) next() rune {
|
||
|
r := l.read()
|
||
|
|
||
|
if r != eof {
|
||
|
l.currentTokenStop++
|
||
|
}
|
||
|
return r
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) ignore() {
|
||
|
l.currentTokenStart = l.currentTokenStop
|
||
|
l.line = l.endbufferLine
|
||
|
l.col = l.endbufferCol
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) skip() {
|
||
|
l.next()
|
||
|
l.ignore()
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) fastForward(n int) {
|
||
|
for i := 0; i < n; i++ {
|
||
|
l.next()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) emitWithValue(t tokenType, value string) {
|
||
|
l.tokens = append(l.tokens, token{
|
||
|
Position: Position{l.line, l.col},
|
||
|
typ: t,
|
||
|
val: value,
|
||
|
})
|
||
|
l.ignore()
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) emit(t tokenType) {
|
||
|
l.emitWithValue(t, string(l.input[l.currentTokenStart:l.currentTokenStop]))
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) peek() rune {
|
||
|
if l.inputIdx >= len(l.input) {
|
||
|
return eof
|
||
|
}
|
||
|
return l.input[l.inputIdx]
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) peekString(size int) string {
|
||
|
maxIdx := len(l.input)
|
||
|
upperIdx := l.inputIdx + size // FIXME: potential overflow
|
||
|
if upperIdx > maxIdx {
|
||
|
upperIdx = maxIdx
|
||
|
}
|
||
|
return string(l.input[l.inputIdx:upperIdx])
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) follow(next string) bool {
|
||
|
return next == l.peekString(len(next))
|
||
|
}
|
||
|
|
||
|
// Error management
|
||
|
|
||
|
func (l *tomlLexer) errorf(format string, args ...interface{}) tomlLexStateFn {
|
||
|
l.tokens = append(l.tokens, token{
|
||
|
Position: Position{l.line, l.col},
|
||
|
typ: tokenError,
|
||
|
val: fmt.Sprintf(format, args...),
|
||
|
})
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
// State functions
|
||
|
|
||
|
func (l *tomlLexer) lexVoid() tomlLexStateFn {
|
||
|
for {
|
||
|
next := l.peek()
|
||
|
switch next {
|
||
|
case '}': // after '{'
|
||
|
return l.lexRightCurlyBrace
|
||
|
case '[':
|
||
|
return l.lexTableKey
|
||
|
case '#':
|
||
|
return l.lexComment(l.lexVoid)
|
||
|
case '=':
|
||
|
return l.lexEqual
|
||
|
case '\r':
|
||
|
fallthrough
|
||
|
case '\n':
|
||
|
l.skip()
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if isSpace(next) {
|
||
|
l.skip()
|
||
|
}
|
||
|
|
||
|
if isKeyStartChar(next) {
|
||
|
return l.lexKey
|
||
|
}
|
||
|
|
||
|
if next == eof {
|
||
|
l.next()
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.emit(tokenEOF)
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexRvalue() tomlLexStateFn {
|
||
|
for {
|
||
|
next := l.peek()
|
||
|
switch next {
|
||
|
case '.':
|
||
|
return l.errorf("cannot start float with a dot")
|
||
|
case '=':
|
||
|
return l.lexEqual
|
||
|
case '[':
|
||
|
return l.lexLeftBracket
|
||
|
case ']':
|
||
|
return l.lexRightBracket
|
||
|
case '{':
|
||
|
return l.lexLeftCurlyBrace
|
||
|
case '}':
|
||
|
return l.lexRightCurlyBrace
|
||
|
case '#':
|
||
|
return l.lexComment(l.lexRvalue)
|
||
|
case '"':
|
||
|
return l.lexString
|
||
|
case '\'':
|
||
|
return l.lexLiteralString
|
||
|
case ',':
|
||
|
return l.lexComma
|
||
|
case '\r':
|
||
|
fallthrough
|
||
|
case '\n':
|
||
|
l.skip()
|
||
|
if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '[' {
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
return l.lexVoid
|
||
|
}
|
||
|
|
||
|
if l.follow("true") {
|
||
|
return l.lexTrue
|
||
|
}
|
||
|
|
||
|
if l.follow("false") {
|
||
|
return l.lexFalse
|
||
|
}
|
||
|
|
||
|
if l.follow("inf") {
|
||
|
return l.lexInf
|
||
|
}
|
||
|
|
||
|
if l.follow("nan") {
|
||
|
return l.lexNan
|
||
|
}
|
||
|
|
||
|
if isSpace(next) {
|
||
|
l.skip()
|
||
|
continue
|
||
|
}
|
||
|
|
||
|
if next == eof {
|
||
|
l.next()
|
||
|
break
|
||
|
}
|
||
|
|
||
|
if next == '+' || next == '-' {
|
||
|
return l.lexNumber
|
||
|
}
|
||
|
|
||
|
if isDigit(next) {
|
||
|
return l.lexDateTimeOrNumber
|
||
|
}
|
||
|
|
||
|
return l.errorf("no value can start with %c", next)
|
||
|
}
|
||
|
|
||
|
l.emit(tokenEOF)
|
||
|
return nil
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexDateTimeOrNumber() tomlLexStateFn {
|
||
|
// Could be either a date/time, or a digit.
|
||
|
// The options for date/times are:
|
||
|
// YYYY-... => date or date-time
|
||
|
// HH:... => time
|
||
|
// Anything else should be a number.
|
||
|
|
||
|
lookAhead := l.peekString(5)
|
||
|
if len(lookAhead) < 3 {
|
||
|
return l.lexNumber()
|
||
|
}
|
||
|
|
||
|
for idx, r := range lookAhead {
|
||
|
if !isDigit(r) {
|
||
|
if idx == 2 && r == ':' {
|
||
|
return l.lexDateTimeOrTime()
|
||
|
}
|
||
|
if idx == 4 && r == '-' {
|
||
|
return l.lexDateTimeOrTime()
|
||
|
}
|
||
|
return l.lexNumber()
|
||
|
}
|
||
|
}
|
||
|
return l.lexNumber()
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexLeftCurlyBrace() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenLeftCurlyBrace)
|
||
|
l.brackets = append(l.brackets, '{')
|
||
|
return l.lexVoid
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexRightCurlyBrace() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenRightCurlyBrace)
|
||
|
if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '{' {
|
||
|
return l.errorf("cannot have '}' here")
|
||
|
}
|
||
|
l.brackets = l.brackets[:len(l.brackets)-1]
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexDateTimeOrTime() tomlLexStateFn {
|
||
|
// Example matches:
|
||
|
// 1979-05-27T07:32:00Z
|
||
|
// 1979-05-27T00:32:00-07:00
|
||
|
// 1979-05-27T00:32:00.999999-07:00
|
||
|
// 1979-05-27 07:32:00Z
|
||
|
// 1979-05-27 00:32:00-07:00
|
||
|
// 1979-05-27 00:32:00.999999-07:00
|
||
|
// 1979-05-27T07:32:00
|
||
|
// 1979-05-27T00:32:00.999999
|
||
|
// 1979-05-27 07:32:00
|
||
|
// 1979-05-27 00:32:00.999999
|
||
|
// 1979-05-27
|
||
|
// 07:32:00
|
||
|
// 00:32:00.999999
|
||
|
|
||
|
// we already know those two are digits
|
||
|
l.next()
|
||
|
l.next()
|
||
|
|
||
|
// Got 2 digits. At that point it could be either a time or a date(-time).
|
||
|
|
||
|
r := l.next()
|
||
|
if r == ':' {
|
||
|
return l.lexTime()
|
||
|
}
|
||
|
|
||
|
return l.lexDateTime()
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexDateTime() tomlLexStateFn {
|
||
|
// This state accepts an offset date-time, a local date-time, or a local date.
|
||
|
//
|
||
|
// v--- cursor
|
||
|
// 1979-05-27T07:32:00Z
|
||
|
// 1979-05-27T00:32:00-07:00
|
||
|
// 1979-05-27T00:32:00.999999-07:00
|
||
|
// 1979-05-27 07:32:00Z
|
||
|
// 1979-05-27 00:32:00-07:00
|
||
|
// 1979-05-27 00:32:00.999999-07:00
|
||
|
// 1979-05-27T07:32:00
|
||
|
// 1979-05-27T00:32:00.999999
|
||
|
// 1979-05-27 07:32:00
|
||
|
// 1979-05-27 00:32:00.999999
|
||
|
// 1979-05-27
|
||
|
|
||
|
// date
|
||
|
|
||
|
// already checked by lexRvalue
|
||
|
l.next() // digit
|
||
|
l.next() // -
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid month digit in date: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r := l.next()
|
||
|
if r != '-' {
|
||
|
return l.errorf("expected - to separate month of a date, not %c", r)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid day digit in date: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.emit(tokenLocalDate)
|
||
|
|
||
|
r = l.peek()
|
||
|
|
||
|
if r == eof {
|
||
|
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
if r != ' ' && r != 'T' {
|
||
|
return l.errorf("incorrect date/time separation character: %c", r)
|
||
|
}
|
||
|
|
||
|
if r == ' ' {
|
||
|
lookAhead := l.peekString(3)[1:]
|
||
|
if len(lookAhead) < 2 {
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
for _, r := range lookAhead {
|
||
|
if !isDigit(r) {
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.skip() // skip the T or ' '
|
||
|
|
||
|
// time
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid hour digit in time: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r = l.next()
|
||
|
if r != ':' {
|
||
|
return l.errorf("time hour/minute separator should be :, not %c", r)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid minute digit in time: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r = l.next()
|
||
|
if r != ':' {
|
||
|
return l.errorf("time minute/second separator should be :, not %c", r)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid second digit in time: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r = l.peek()
|
||
|
if r == '.' {
|
||
|
l.next()
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("expected at least one digit in time's fraction, not %c", r)
|
||
|
}
|
||
|
|
||
|
for {
|
||
|
r := l.peek()
|
||
|
if !isDigit(r) {
|
||
|
break
|
||
|
}
|
||
|
l.next()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.emit(tokenLocalTime)
|
||
|
|
||
|
return l.lexTimeOffset
|
||
|
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexTimeOffset() tomlLexStateFn {
|
||
|
// potential offset
|
||
|
|
||
|
// Z
|
||
|
// -07:00
|
||
|
// +07:00
|
||
|
// nothing
|
||
|
|
||
|
r := l.peek()
|
||
|
|
||
|
if r == 'Z' {
|
||
|
l.next()
|
||
|
l.emit(tokenTimeOffset)
|
||
|
} else if r == '+' || r == '-' {
|
||
|
l.next()
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid hour digit in time offset: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r = l.next()
|
||
|
if r != ':' {
|
||
|
return l.errorf("time offset hour/minute separator should be :, not %c", r)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid minute digit in time offset: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.emit(tokenTimeOffset)
|
||
|
}
|
||
|
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexTime() tomlLexStateFn {
|
||
|
// v--- cursor
|
||
|
// 07:32:00
|
||
|
// 00:32:00.999999
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid minute digit in time: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r := l.next()
|
||
|
if r != ':' {
|
||
|
return l.errorf("time minute/second separator should be :, not %c", r)
|
||
|
}
|
||
|
|
||
|
for i := 0; i < 2; i++ {
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("invalid second digit in time: %c", r)
|
||
|
}
|
||
|
}
|
||
|
|
||
|
r = l.peek()
|
||
|
if r == '.' {
|
||
|
l.next()
|
||
|
r := l.next()
|
||
|
if !isDigit(r) {
|
||
|
return l.errorf("expected at least one digit in time's fraction, not %c", r)
|
||
|
}
|
||
|
|
||
|
for {
|
||
|
r := l.peek()
|
||
|
if !isDigit(r) {
|
||
|
break
|
||
|
}
|
||
|
l.next()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
l.emit(tokenLocalTime)
|
||
|
return l.lexRvalue
|
||
|
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexTrue() tomlLexStateFn {
|
||
|
l.fastForward(4)
|
||
|
l.emit(tokenTrue)
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexFalse() tomlLexStateFn {
|
||
|
l.fastForward(5)
|
||
|
l.emit(tokenFalse)
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexInf() tomlLexStateFn {
|
||
|
l.fastForward(3)
|
||
|
l.emit(tokenInf)
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexNan() tomlLexStateFn {
|
||
|
l.fastForward(3)
|
||
|
l.emit(tokenNan)
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexEqual() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenEqual)
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexComma() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenComma)
|
||
|
if len(l.brackets) > 0 && l.brackets[len(l.brackets)-1] == '{' {
|
||
|
return l.lexVoid
|
||
|
}
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
// Parse the key and emits its value without escape sequences.
|
||
|
// bare keys, basic string keys and literal string keys are supported.
|
||
|
func (l *tomlLexer) lexKey() tomlLexStateFn {
|
||
|
var sb strings.Builder
|
||
|
|
||
|
for r := l.peek(); isKeyChar(r) || r == '\n' || r == '\r'; r = l.peek() {
|
||
|
if r == '"' {
|
||
|
l.next()
|
||
|
str, err := l.lexStringAsString(`"`, false, true)
|
||
|
if err != nil {
|
||
|
return l.errorf(err.Error())
|
||
|
}
|
||
|
sb.WriteString("\"")
|
||
|
sb.WriteString(str)
|
||
|
sb.WriteString("\"")
|
||
|
l.next()
|
||
|
continue
|
||
|
} else if r == '\'' {
|
||
|
l.next()
|
||
|
str, err := l.lexLiteralStringAsString(`'`, false)
|
||
|
if err != nil {
|
||
|
return l.errorf(err.Error())
|
||
|
}
|
||
|
sb.WriteString("'")
|
||
|
sb.WriteString(str)
|
||
|
sb.WriteString("'")
|
||
|
l.next()
|
||
|
continue
|
||
|
} else if r == '\n' {
|
||
|
return l.errorf("keys cannot contain new lines")
|
||
|
} else if isSpace(r) {
|
||
|
var str strings.Builder
|
||
|
str.WriteString(" ")
|
||
|
|
||
|
// skip trailing whitespace
|
||
|
l.next()
|
||
|
for r = l.peek(); isSpace(r); r = l.peek() {
|
||
|
str.WriteRune(r)
|
||
|
l.next()
|
||
|
}
|
||
|
// break loop if not a dot
|
||
|
if r != '.' {
|
||
|
break
|
||
|
}
|
||
|
str.WriteString(".")
|
||
|
// skip trailing whitespace after dot
|
||
|
l.next()
|
||
|
for r = l.peek(); isSpace(r); r = l.peek() {
|
||
|
str.WriteRune(r)
|
||
|
l.next()
|
||
|
}
|
||
|
sb.WriteString(str.String())
|
||
|
continue
|
||
|
} else if r == '.' {
|
||
|
// skip
|
||
|
} else if !isValidBareChar(r) {
|
||
|
return l.errorf("keys cannot contain %c character", r)
|
||
|
}
|
||
|
sb.WriteRune(r)
|
||
|
l.next()
|
||
|
}
|
||
|
l.emitWithValue(tokenKey, sb.String())
|
||
|
return l.lexVoid
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexComment(previousState tomlLexStateFn) tomlLexStateFn {
|
||
|
return func() tomlLexStateFn {
|
||
|
for next := l.peek(); next != '\n' && next != eof; next = l.peek() {
|
||
|
if next == '\r' && l.follow("\r\n") {
|
||
|
break
|
||
|
}
|
||
|
l.next()
|
||
|
}
|
||
|
l.ignore()
|
||
|
return previousState
|
||
|
}
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexLeftBracket() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenLeftBracket)
|
||
|
l.brackets = append(l.brackets, '[')
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexLiteralStringAsString(terminator string, discardLeadingNewLine bool) (string, error) {
|
||
|
var sb strings.Builder
|
||
|
|
||
|
if discardLeadingNewLine {
|
||
|
if l.follow("\r\n") {
|
||
|
l.skip()
|
||
|
l.skip()
|
||
|
} else if l.peek() == '\n' {
|
||
|
l.skip()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// find end of string
|
||
|
for {
|
||
|
if l.follow(terminator) {
|
||
|
return sb.String(), nil
|
||
|
}
|
||
|
|
||
|
next := l.peek()
|
||
|
if next == eof {
|
||
|
break
|
||
|
}
|
||
|
sb.WriteRune(l.next())
|
||
|
}
|
||
|
|
||
|
return "", errors.New("unclosed string")
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexLiteralString() tomlLexStateFn {
|
||
|
l.skip()
|
||
|
|
||
|
// handle special case for triple-quote
|
||
|
terminator := "'"
|
||
|
discardLeadingNewLine := false
|
||
|
if l.follow("''") {
|
||
|
l.skip()
|
||
|
l.skip()
|
||
|
terminator = "'''"
|
||
|
discardLeadingNewLine = true
|
||
|
}
|
||
|
|
||
|
str, err := l.lexLiteralStringAsString(terminator, discardLeadingNewLine)
|
||
|
if err != nil {
|
||
|
return l.errorf(err.Error())
|
||
|
}
|
||
|
|
||
|
l.emitWithValue(tokenString, str)
|
||
|
l.fastForward(len(terminator))
|
||
|
l.ignore()
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
// Lex a string and return the results as a string.
|
||
|
// Terminator is the substring indicating the end of the token.
|
||
|
// The resulting string does not include the terminator.
|
||
|
func (l *tomlLexer) lexStringAsString(terminator string, discardLeadingNewLine, acceptNewLines bool) (string, error) {
|
||
|
var sb strings.Builder
|
||
|
|
||
|
if discardLeadingNewLine {
|
||
|
if l.follow("\r\n") {
|
||
|
l.skip()
|
||
|
l.skip()
|
||
|
} else if l.peek() == '\n' {
|
||
|
l.skip()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
for {
|
||
|
if l.follow(terminator) {
|
||
|
return sb.String(), nil
|
||
|
}
|
||
|
|
||
|
if l.follow("\\") {
|
||
|
l.next()
|
||
|
switch l.peek() {
|
||
|
case '\r':
|
||
|
fallthrough
|
||
|
case '\n':
|
||
|
fallthrough
|
||
|
case '\t':
|
||
|
fallthrough
|
||
|
case ' ':
|
||
|
// skip all whitespace chars following backslash
|
||
|
for strings.ContainsRune("\r\n\t ", l.peek()) {
|
||
|
l.next()
|
||
|
}
|
||
|
case '"':
|
||
|
sb.WriteString("\"")
|
||
|
l.next()
|
||
|
case 'n':
|
||
|
sb.WriteString("\n")
|
||
|
l.next()
|
||
|
case 'b':
|
||
|
sb.WriteString("\b")
|
||
|
l.next()
|
||
|
case 'f':
|
||
|
sb.WriteString("\f")
|
||
|
l.next()
|
||
|
case '/':
|
||
|
sb.WriteString("/")
|
||
|
l.next()
|
||
|
case 't':
|
||
|
sb.WriteString("\t")
|
||
|
l.next()
|
||
|
case 'r':
|
||
|
sb.WriteString("\r")
|
||
|
l.next()
|
||
|
case '\\':
|
||
|
sb.WriteString("\\")
|
||
|
l.next()
|
||
|
case 'u':
|
||
|
l.next()
|
||
|
var code strings.Builder
|
||
|
for i := 0; i < 4; i++ {
|
||
|
c := l.peek()
|
||
|
if !isHexDigit(c) {
|
||
|
return "", errors.New("unfinished unicode escape")
|
||
|
}
|
||
|
l.next()
|
||
|
code.WriteRune(c)
|
||
|
}
|
||
|
intcode, err := strconv.ParseInt(code.String(), 16, 32)
|
||
|
if err != nil {
|
||
|
return "", errors.New("invalid unicode escape: \\u" + code.String())
|
||
|
}
|
||
|
sb.WriteRune(rune(intcode))
|
||
|
case 'U':
|
||
|
l.next()
|
||
|
var code strings.Builder
|
||
|
for i := 0; i < 8; i++ {
|
||
|
c := l.peek()
|
||
|
if !isHexDigit(c) {
|
||
|
return "", errors.New("unfinished unicode escape")
|
||
|
}
|
||
|
l.next()
|
||
|
code.WriteRune(c)
|
||
|
}
|
||
|
intcode, err := strconv.ParseInt(code.String(), 16, 64)
|
||
|
if err != nil {
|
||
|
return "", errors.New("invalid unicode escape: \\U" + code.String())
|
||
|
}
|
||
|
sb.WriteRune(rune(intcode))
|
||
|
default:
|
||
|
return "", errors.New("invalid escape sequence: \\" + string(l.peek()))
|
||
|
}
|
||
|
} else {
|
||
|
r := l.peek()
|
||
|
|
||
|
if 0x00 <= r && r <= 0x1F && r != '\t' && !(acceptNewLines && (r == '\n' || r == '\r')) {
|
||
|
return "", fmt.Errorf("unescaped control character %U", r)
|
||
|
}
|
||
|
l.next()
|
||
|
sb.WriteRune(r)
|
||
|
}
|
||
|
|
||
|
if l.peek() == eof {
|
||
|
break
|
||
|
}
|
||
|
}
|
||
|
|
||
|
return "", errors.New("unclosed string")
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexString() tomlLexStateFn {
|
||
|
l.skip()
|
||
|
|
||
|
// handle special case for triple-quote
|
||
|
terminator := `"`
|
||
|
discardLeadingNewLine := false
|
||
|
acceptNewLines := false
|
||
|
if l.follow(`""`) {
|
||
|
l.skip()
|
||
|
l.skip()
|
||
|
terminator = `"""`
|
||
|
discardLeadingNewLine = true
|
||
|
acceptNewLines = true
|
||
|
}
|
||
|
|
||
|
str, err := l.lexStringAsString(terminator, discardLeadingNewLine, acceptNewLines)
|
||
|
if err != nil {
|
||
|
return l.errorf(err.Error())
|
||
|
}
|
||
|
|
||
|
l.emitWithValue(tokenString, str)
|
||
|
l.fastForward(len(terminator))
|
||
|
l.ignore()
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexTableKey() tomlLexStateFn {
|
||
|
l.next()
|
||
|
|
||
|
if l.peek() == '[' {
|
||
|
// token '[[' signifies an array of tables
|
||
|
l.next()
|
||
|
l.emit(tokenDoubleLeftBracket)
|
||
|
return l.lexInsideTableArrayKey
|
||
|
}
|
||
|
// vanilla table key
|
||
|
l.emit(tokenLeftBracket)
|
||
|
return l.lexInsideTableKey
|
||
|
}
|
||
|
|
||
|
// Parse the key till "]]", but only bare keys are supported
|
||
|
func (l *tomlLexer) lexInsideTableArrayKey() tomlLexStateFn {
|
||
|
for r := l.peek(); r != eof; r = l.peek() {
|
||
|
switch r {
|
||
|
case ']':
|
||
|
if l.currentTokenStop > l.currentTokenStart {
|
||
|
l.emit(tokenKeyGroupArray)
|
||
|
}
|
||
|
l.next()
|
||
|
if l.peek() != ']' {
|
||
|
break
|
||
|
}
|
||
|
l.next()
|
||
|
l.emit(tokenDoubleRightBracket)
|
||
|
return l.lexVoid
|
||
|
case '[':
|
||
|
return l.errorf("table array key cannot contain ']'")
|
||
|
default:
|
||
|
l.next()
|
||
|
}
|
||
|
}
|
||
|
return l.errorf("unclosed table array key")
|
||
|
}
|
||
|
|
||
|
// Parse the key till "]" but only bare keys are supported
|
||
|
func (l *tomlLexer) lexInsideTableKey() tomlLexStateFn {
|
||
|
for r := l.peek(); r != eof; r = l.peek() {
|
||
|
switch r {
|
||
|
case ']':
|
||
|
if l.currentTokenStop > l.currentTokenStart {
|
||
|
l.emit(tokenKeyGroup)
|
||
|
}
|
||
|
l.next()
|
||
|
l.emit(tokenRightBracket)
|
||
|
return l.lexVoid
|
||
|
case '[':
|
||
|
return l.errorf("table key cannot contain ']'")
|
||
|
default:
|
||
|
l.next()
|
||
|
}
|
||
|
}
|
||
|
return l.errorf("unclosed table key")
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) lexRightBracket() tomlLexStateFn {
|
||
|
l.next()
|
||
|
l.emit(tokenRightBracket)
|
||
|
if len(l.brackets) == 0 || l.brackets[len(l.brackets)-1] != '[' {
|
||
|
return l.errorf("cannot have ']' here")
|
||
|
}
|
||
|
l.brackets = l.brackets[:len(l.brackets)-1]
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
// validRuneFn reports whether r is accepted inside the digits of a prefixed
// integer literal; lexNumber picks one according to the base prefix
// (0x, 0o or 0b).
type validRuneFn func(r rune) bool
|
||
|
|
||
|
// isValidHexRune reports whether r may appear in a hexadecimal literal:
// 0-9, a-f, A-F, or the '_' digit separator.
func isValidHexRune(r rune) bool {
	switch {
	case 'a' <= r && r <= 'f',
		'A' <= r && r <= 'F',
		'0' <= r && r <= '9':
		return true
	}
	return r == '_'
}
|
||
|
|
||
|
// isValidOctalRune reports whether r may appear in an octal literal: 0-7 or
// the '_' digit separator.
func isValidOctalRune(r rune) bool {
	if r == '_' {
		return true
	}
	return '0' <= r && r <= '7'
}
|
||
|
|
||
|
// isValidBinaryRune reports whether r may appear in a binary literal: 0, 1 or
// the '_' digit separator.
func isValidBinaryRune(r rune) bool {
	switch r {
	case '0', '1', '_':
		return true
	}
	return false
}
|
||
|
|
||
|
func (l *tomlLexer) lexNumber() tomlLexStateFn {
|
||
|
r := l.peek()
|
||
|
|
||
|
if r == '0' {
|
||
|
follow := l.peekString(2)
|
||
|
if len(follow) == 2 {
|
||
|
var isValidRune validRuneFn
|
||
|
switch follow[1] {
|
||
|
case 'x':
|
||
|
isValidRune = isValidHexRune
|
||
|
case 'o':
|
||
|
isValidRune = isValidOctalRune
|
||
|
case 'b':
|
||
|
isValidRune = isValidBinaryRune
|
||
|
default:
|
||
|
if follow[1] >= 'a' && follow[1] <= 'z' || follow[1] >= 'A' && follow[1] <= 'Z' {
|
||
|
return l.errorf("unknown number base: %s. possible options are x (hex) o (octal) b (binary)", string(follow[1]))
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if isValidRune != nil {
|
||
|
l.next()
|
||
|
l.next()
|
||
|
digitSeen := false
|
||
|
for {
|
||
|
next := l.peek()
|
||
|
if !isValidRune(next) {
|
||
|
break
|
||
|
}
|
||
|
digitSeen = true
|
||
|
l.next()
|
||
|
}
|
||
|
|
||
|
if !digitSeen {
|
||
|
return l.errorf("number needs at least one digit")
|
||
|
}
|
||
|
|
||
|
l.emit(tokenInteger)
|
||
|
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if r == '+' || r == '-' {
|
||
|
l.next()
|
||
|
if l.follow("inf") {
|
||
|
return l.lexInf
|
||
|
}
|
||
|
if l.follow("nan") {
|
||
|
return l.lexNan
|
||
|
}
|
||
|
}
|
||
|
|
||
|
pointSeen := false
|
||
|
expSeen := false
|
||
|
digitSeen := false
|
||
|
for {
|
||
|
next := l.peek()
|
||
|
if next == '.' {
|
||
|
if pointSeen {
|
||
|
return l.errorf("cannot have two dots in one float")
|
||
|
}
|
||
|
l.next()
|
||
|
if !isDigit(l.peek()) {
|
||
|
return l.errorf("float cannot end with a dot")
|
||
|
}
|
||
|
pointSeen = true
|
||
|
} else if next == 'e' || next == 'E' {
|
||
|
expSeen = true
|
||
|
l.next()
|
||
|
r := l.peek()
|
||
|
if r == '+' || r == '-' {
|
||
|
l.next()
|
||
|
}
|
||
|
} else if isDigit(next) {
|
||
|
digitSeen = true
|
||
|
l.next()
|
||
|
} else if next == '_' {
|
||
|
l.next()
|
||
|
} else {
|
||
|
break
|
||
|
}
|
||
|
if pointSeen && !digitSeen {
|
||
|
return l.errorf("cannot start float with a dot")
|
||
|
}
|
||
|
}
|
||
|
|
||
|
if !digitSeen {
|
||
|
return l.errorf("no digit in that number")
|
||
|
}
|
||
|
if pointSeen || expSeen {
|
||
|
l.emit(tokenFloat)
|
||
|
} else {
|
||
|
l.emit(tokenInteger)
|
||
|
}
|
||
|
return l.lexRvalue
|
||
|
}
|
||
|
|
||
|
func (l *tomlLexer) run() {
|
||
|
for state := l.lexVoid; state != nil; {
|
||
|
state = state()
|
||
|
}
|
||
|
}
|
||
|
|
||
|
// Entry point
|
||
|
func lexToml(inputBytes []byte) []token {
|
||
|
runes := bytes.Runes(inputBytes)
|
||
|
l := &tomlLexer{
|
||
|
input: runes,
|
||
|
tokens: make([]token, 0, 256),
|
||
|
line: 1,
|
||
|
col: 1,
|
||
|
endbufferLine: 1,
|
||
|
endbufferCol: 1,
|
||
|
}
|
||
|
l.run()
|
||
|
return l.tokens
|
||
|
}
|