// Copyright 2012 The Gorilla Authors. All rights reserved. // Use of this source code is governed by a BSD-style // license that can be found in the LICENSE file. package scanner import ( "fmt" "regexp" "strings" "unicode" "unicode/utf8" ) // tokenType identifies the type of lexical tokens. type tokenType int // String returns a string representation of the token type. func (t tokenType) String() string { return tokenNames[t] } // Token represents a token and the corresponding string. type Token struct { Type tokenType Value string Line int Column int } // String returns a string representation of the token. func (t *Token) String() string { if len(t.Value) > 10 { return fmt.Sprintf("%s (line: %d, column: %d): %.10q...", t.Type, t.Line, t.Column, t.Value) } return fmt.Sprintf("%s (line: %d, column: %d): %q", t.Type, t.Line, t.Column, t.Value) } // All tokens ----------------------------------------------------------------- // The complete list of tokens in CSS3. const ( // Scanner flags. TokenError tokenType = iota TokenEOF // From now on, only tokens from the CSS specification. TokenIdent TokenAtKeyword TokenString TokenHash TokenNumber TokenPercentage TokenDimension TokenURI TokenUnicodeRange TokenCDO TokenCDC TokenS TokenComment TokenFunction TokenIncludes TokenDashMatch TokenPrefixMatch TokenSuffixMatch TokenSubstringMatch TokenChar TokenBOM ) // tokenNames maps tokenType's to their names. Used for conversion to string. 
var tokenNames = map[tokenType]string{
	TokenError:          "error",
	TokenEOF:            "EOF",
	TokenIdent:          "IDENT",
	TokenAtKeyword:      "ATKEYWORD",
	TokenString:         "STRING",
	TokenHash:           "HASH",
	TokenNumber:         "NUMBER",
	TokenPercentage:     "PERCENTAGE",
	TokenDimension:      "DIMENSION",
	TokenURI:            "URI",
	TokenUnicodeRange:   "UNICODE-RANGE",
	TokenCDO:            "CDO",
	TokenCDC:            "CDC",
	TokenS:              "S",
	TokenComment:        "COMMENT",
	TokenFunction:       "FUNCTION",
	TokenIncludes:       "INCLUDES",
	TokenDashMatch:      "DASHMATCH",
	TokenPrefixMatch:    "PREFIXMATCH",
	TokenSuffixMatch:    "SUFFIXMATCH",
	TokenSubstringMatch: "SUBSTRINGMATCH",
	TokenChar:           "CHAR",
	TokenBOM:            "BOM",
}

// Macros and productions -----------------------------------------------------
// http://www.w3.org/TR/css3-syntax/#tokenization

// macroRegexp matches a macro reference of the form {name}, where name is
// made of lowercase ASCII letters only. Regexp repetition counts such as
// {1,6} therefore do not match and are left untouched.
var macroRegexp = regexp.MustCompile(`\{[a-z]+\}`)

// macros maps macro names to patterns to be expanded.
//
// A pattern may itself reference other macros using the {name} syntax.
var macros = map[string]string{
	// must be escaped: `\.+*?()|[]{}^$`
	"ident":      `-?{nmstart}{nmchar}*`,
	"name":       `{nmchar}+`,
	"nmstart":    `[a-zA-Z_]|{nonascii}|{escape}`,
	"nonascii":   "[\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"unicode":    `\\[0-9a-fA-F]{1,6}{wc}?`,
	"escape":     "{unicode}|\\\\[\u0020-\u007E\u0080-\uD7FF\uE000-\uFFFD\U00010000-\U0010FFFF]",
	"nmchar":     `[a-zA-Z0-9_-]|{nonascii}|{escape}`,
	"num":        `[0-9]*\.[0-9]+|[0-9]+`,
	"string":     `"(?:{stringchar}|')*"|'(?:{stringchar}|")*'`,
	"stringchar": `{urlchar}|[ ]|\\{nl}`,
	"nl":         `[\n\r\f]|\r\n`,
	"w":          `{wc}*`,
	"wc":         `[\t\n\f\r ]`,

	// urlchar should accept [(ascii characters minus those that need escaping)|{nonascii}|{escape}]
	// ASCII characters range = `[\u0020-\u007e]`
	// Skip space \u0020 = `[\u0021-\u007e]`
	// Skip quotation mark \u0022 = `[\u0021\u0023-\u007e]`
	// Skip apostrophe \u0027 = `[\u0021\u0023-\u0026\u0028-\u007e]`
	// Skip reverse solidus \u005c = `[\u0021\u0023-\u0026\u0028-\u005b\u005d-\u007e]`
	// Finally, the left square bracket (\u005b) and right (\u005d) need escaping themselves
	"urlchar": "[\u0021\u0023-\u0026\u0028-\\\u005b\\\u005d-\u007E]|{nonascii}|{escape}",
}
productions maps the list of tokens to patterns to be expanded. var productions = map[tokenType]string{ // Unused regexps (matched using other methods) are commented out. TokenIdent: `{ident}`, TokenAtKeyword: `@{ident}`, TokenString: `{string}`, TokenHash: `#{name}`, TokenNumber: `{num}`, TokenPercentage: `{num}%`, TokenDimension: `{num}{ident}`, TokenURI: `url\({w}(?:{string}|{urlchar}*?){w}\)`, TokenUnicodeRange: `U\+[0-9A-F\?]{1,6}(?:-[0-9A-F]{1,6})?`, //TokenCDO: `<!--`, TokenCDC: `-->`, TokenS: `{wc}+`, TokenComment: `/\*[^\*]*[\*]+(?:[^/][^\*]*[\*]+)*/`, TokenFunction: `{ident}\(`, //TokenIncludes: `~=`, //TokenDashMatch: `\|=`, //TokenPrefixMatch: `\^=`, //TokenSuffixMatch: `\$=`, //TokenSubstringMatch: `\*=`, //TokenChar: `[^"']`, //TokenBOM: "\uFEFF", } // matchers maps the list of tokens to compiled regular expressions. // // The map is filled on init() using the macros and productions defined in // the CSS specification. var matchers = map[tokenType]*regexp.Regexp{} // matchOrder is the order to test regexps when first-char shortcuts // can't be used. var matchOrder = []tokenType{ TokenURI, TokenFunction, TokenUnicodeRange, TokenIdent, TokenDimension, TokenPercentage, TokenNumber, TokenCDC, } func init() { // replace macros and compile regexps for productions. replaceMacro := func(s string) string { return "(?:" + macros[s[1:len(s)-1]] + ")" } for t, s := range productions { for macroRegexp.MatchString(s) { s = macroRegexp.ReplaceAllStringFunc(s, replaceMacro) } matchers[t] = regexp.MustCompile("^(?:" + s + ")") } } // Scanner -------------------------------------------------------------------- // New returns a new CSS scanner for the given input. func New(input string) *Scanner { // Normalize newlines. 
// https://www.w3.org/TR/css-syntax-3/#input-preprocessing input = strings.Replace(input, "\r\n", "\n", -1) input = strings.Replace(input, "\r", "\n", -1) input = strings.Replace(input, "\f", "\n", -1) input = strings.Replace(input, "\u0000", "\ufffd", -1) return &Scanner{ input: input, row: 1, col: 1, } } // Scanner scans an input and emits tokens following the CSS3 specification. type Scanner struct { input string pos int row int col int err *Token } // Next returns the next token from the input. // // At the end of the input the token type is TokenEOF. // // If the input can't be tokenized the token type is TokenError. This occurs // in case of unclosed quotation marks or comments. func (s *Scanner) Next() *Token { if s.err != nil { return s.err } if s.pos >= len(s.input) { s.err = &Token{TokenEOF, "", s.row, s.col} return s.err } if s.pos == 0 { // Test BOM only once, at the beginning of the file. if strings.HasPrefix(s.input, "\uFEFF") { return s.emitSimple(TokenBOM, "\uFEFF") } } // There's a lot we can guess based on the first byte so we'll take a // shortcut before testing multiple regexps. input := s.input[s.pos:] switch input[0] { case '\t', '\n', ' ': // Whitespace. return s.emitToken(TokenS, matchers[TokenS].FindString(input)) case '.': // Dot is too common to not have a quick check. // We'll test if this is a Char; if it is followed by a number it is a // dimension/percentage/number, and this will be matched later. if len(input) > 1 && !unicode.IsDigit(rune(input[1])) { return s.emitSimple(TokenChar, ".") } case '#': // Another common one: Hash or Char. if match := matchers[TokenHash].FindString(input); match != "" { return s.emitToken(TokenHash, match) } return s.emitSimple(TokenChar, "#") case '@': // Another common one: AtKeyword or Char. 
if match := matchers[TokenAtKeyword].FindString(input); match != "" { return s.emitSimple(TokenAtKeyword, match) } return s.emitSimple(TokenChar, "@") case ':', ',', ';', '%', '&', '+', '=', '>', '(', ')', '[', ']', '{', '}': // More common chars. return s.emitSimple(TokenChar, string(input[0])) case '"', '\'': // String or error. match := matchers[TokenString].FindString(input) if match != "" { return s.emitToken(TokenString, match) } s.err = &Token{TokenError, "unclosed quotation mark", s.row, s.col} return s.err case '/': // Comment, error or Char. if len(input) > 1 && input[1] == '*' { match := matchers[TokenComment].FindString(input) if match != "" { return s.emitToken(TokenComment, match) } else { s.err = &Token{TokenError, "unclosed comment", s.row, s.col} return s.err } } return s.emitSimple(TokenChar, "/") case '~': // Includes or Char. return s.emitPrefixOrChar(TokenIncludes, "~=") case '|': // DashMatch or Char. return s.emitPrefixOrChar(TokenDashMatch, "|=") case '^': // PrefixMatch or Char. return s.emitPrefixOrChar(TokenPrefixMatch, "^=") case '$': // SuffixMatch or Char. return s.emitPrefixOrChar(TokenSuffixMatch, "$=") case '*': // SubstringMatch or Char. return s.emitPrefixOrChar(TokenSubstringMatch, "*=") case '<': // CDO or Char. return s.emitPrefixOrChar(TokenCDO, "<!--") } // Test all regexps, in order. for _, token := range matchOrder { if match := matchers[token].FindString(input); match != "" { return s.emitToken(token, match) } } // We already handled unclosed quotation marks and comments, // so this can only be a Char. r, width := utf8.DecodeRuneInString(input) token := &Token{TokenChar, string(r), s.row, s.col} s.col += width s.pos += width return token } // updatePosition updates input coordinates based on the consumed text. 
func (s *Scanner) updatePosition(text string) { width := utf8.RuneCountInString(text) lines := strings.Count(text, "\n") s.row += lines if lines == 0 { s.col += width } else { s.col = utf8.RuneCountInString(text[strings.LastIndex(text, "\n"):]) } s.pos += len(text) // while col is a rune index, pos is a byte index } // emitToken returns a Token for the string v and updates the scanner position. func (s *Scanner) emitToken(t tokenType, v string) *Token { token := &Token{t, v, s.row, s.col} s.updatePosition(v) return token } // emitSimple returns a Token for the string v and updates the scanner // position in a simplified manner. // // The string is known to have only ASCII characters and to not have a newline. func (s *Scanner) emitSimple(t tokenType, v string) *Token { token := &Token{t, v, s.row, s.col} s.col += len(v) s.pos += len(v) return token } // emitPrefixOrChar returns a Token for type t if the current position // matches the given prefix. Otherwise it returns a Char token using the // first character from the prefix. // // The prefix is known to have only ASCII characters and to not have a newline. func (s *Scanner) emitPrefixOrChar(t tokenType, prefix string) *Token { if strings.HasPrefix(s.input[s.pos:], prefix) { return s.emitSimple(t, prefix) } return s.emitSimple(TokenChar, string(prefix[0])) }