lexer.go

package pongo2

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

const (
	TokenError = iota
	EOF
	TokenHTML
	TokenKeyword
	TokenIdentifier
	TokenString
	TokenNumber
	TokenSymbol
)

var (
	tokenSpaceChars                = " \n\r\t"
	tokenIdentifierChars           = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_"
	tokenIdentifierCharsWithDigits = "abcdefghijklmnopqrstuvwxyzABCDEFGHIJKLMNOPQRSTUVWXYZ_0123456789"
	tokenDigits                    = "0123456789"

	// Available symbols in pongo2 (within filters/tags)
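	// Note: stateCode matches these in order, so multi-char symbols must
	// come before any one-char prefix they contain (e.g. "==" before "=").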
	TokenSymbols = []string{
		// 3-Char symbols

		// 2-Char symbols
		"==", ">=", "<=", "&&", "||", "{{", "}}", "{%", "%}", "!=", "<>",

		// 1-Char symbols
		"(", ")", "+", "-", "*", "<", ">", "/", "^", ",", ".", "!", "|", ":", "=", "%",
	}

	// Available keywords in pongo2
	TokenKeywords = []string{"in", "and", "or", "not", "true", "false", "as", "export"}
)

type TokenType int

type Token struct {
	Filename string
	Typ      TokenType
	Val      string
	Line     int
	Col      int
}
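
// lexerStateFn implements the state-function pattern: each state consumes
// input and returns the next state to run, or nil when the current
// tag/variable has been fully tokenized.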
type lexerStateFn func() lexerStateFn

type lexer struct {
	name      string
	input     string
	start     int // start pos of the item
	pos       int // current pos
	width     int // width of last rune
	tokens    []*Token
	errored   bool
	startline int
	startcol  int
	line      int
	col       int

	in_verbatim   bool
	verbatim_name string
}

func (t *Token) String() string {
	val := t.Val
	if len(val) > 1000 {
		val = fmt.Sprintf("%s...%s", val[:10], val[len(val)-5:])
	}

	typ := ""
	switch t.Typ {
	case TokenHTML:
		typ = "HTML"
	case TokenError:
		typ = "Error"
	case TokenIdentifier:
		typ = "Identifier"
	case TokenKeyword:
		typ = "Keyword"
	case TokenNumber:
		typ = "Number"
	case TokenString:
		typ = "String"
	case TokenSymbol:
		typ = "Symbol"
	default:
		typ = "Unknown"
	}

	return fmt.Sprintf("<Token Typ=%s (%d) Val='%s' Line=%d Col=%d>",
		typ, t.Typ, val, t.Line, t.Col)
}

func lex(name string, input string) ([]*Token, *Error) {
	l := &lexer{
		name:      name,
		input:     input,
		tokens:    make([]*Token, 0, 100),
		line:      1,
		col:       1,
		startline: 1,
		startcol:  1,
	}
	l.run()
	if l.errored {
		errtoken := l.tokens[len(l.tokens)-1]
		return nil, &Error{
			Filename: name,
			Line:     errtoken.Line,
			Column:   errtoken.Col,
			Sender:   "lexer",
			ErrorMsg: errtoken.Val,
		}
	}
	return l.tokens, nil
}
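
// A rough usage sketch: lex("example", "Hello {{ name }}!") yields
// TokenHTML("Hello "), TokenSymbol("{{"), TokenIdentifier("name"),
// TokenSymbol("}}") and TokenHTML("!"); whitespace inside the delimiters
// is dropped, and "{{ user.name|lower }}" additionally produces the
// symbols "." and "|" between the identifiers.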

func (l *lexer) value() string {
	return l.input[l.start:l.pos]
}

func (l *lexer) length() int {
	return l.pos - l.start
}

func (l *lexer) emit(t TokenType) {
	tok := &Token{
		Filename: l.name,
		Typ:      t,
		Val:      l.value(),
		Line:     l.startline,
		Col:      l.startcol,
	}

	if t == TokenString {
		// Unescape the \" and \\ escape sequences in strings
		tok.Val = strings.Replace(tok.Val, `\"`, `"`, -1)
		tok.Val = strings.Replace(tok.Val, `\\`, `\`, -1)
	}

	l.tokens = append(l.tokens, tok)
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}
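
// next returns the next rune in the input; at end of input it returns the
// EOF constant, which doubles as a sentinel rune throughout the lexer.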
func (l *lexer) next() rune {
	if l.pos >= len(l.input) {
		l.width = 0
		return EOF
	}
	r, w := utf8.DecodeRuneInString(l.input[l.pos:])
	l.width = w
	l.pos += l.width
	l.col += l.width
	return r
}

func (l *lexer) backup() {
	l.pos -= l.width
	l.col -= l.width
}

func (l *lexer) peek() rune {
	r := l.next()
	l.backup()
	return r
}

func (l *lexer) ignore() {
	l.start = l.pos
	l.startline = l.line
	l.startcol = l.col
}

func (l *lexer) accept(what string) bool {
	if strings.IndexRune(what, l.next()) >= 0 {
		return true
	}
	l.backup()
	return false
}

func (l *lexer) acceptRun(what string) {
	for strings.IndexRune(what, l.next()) >= 0 {
	}
	l.backup()
}

func (l *lexer) errorf(format string, args ...interface{}) lexerStateFn {
	t := &Token{
		Filename: l.name,
		Typ:      TokenError,
		Val:      fmt.Sprintf(format, args...),
		Line:     l.startline,
		Col:      l.startcol,
	}
	l.tokens = append(l.tokens, t)
	l.errored = true
	l.startline = l.line
	l.startcol = l.col
	return nil
}

func (l *lexer) eof() bool {
	return l.start >= len(l.input)-1
}
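
// run is the top-level loop: it copies plain HTML through verbatim blocks
// and past comments until it sees a "{{" or "{%" opener, then hands the
// delimited expression to tokenize().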
func (l *lexer) run() {
	for {
		// TODO: Support verbatim tag names
		// https://docs.djangoproject.com/en/dev/ref/templates/builtins/#verbatim
		if l.in_verbatim {
			name := l.verbatim_name
			if name != "" {
				name += " "
			}
			endTag := fmt.Sprintf("{%% endverbatim %s%%}", name)
			if strings.HasPrefix(l.input[l.pos:], endTag) { // end verbatim
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				w := len(endTag)
				l.pos += w
				l.col += w
				l.ignore()
				l.in_verbatim = false
			}
		} else if strings.HasPrefix(l.input[l.pos:], "{% verbatim %}") { // tag
			if l.pos > l.start {
				l.emit(TokenHTML)
			}
			l.in_verbatim = true
			w := len("{% verbatim %}")
			l.pos += w
			l.col += w
			l.ignore()
		}

		if !l.in_verbatim {
			// Ignore single-line comments {# ... #}
			if strings.HasPrefix(l.input[l.pos:], "{#") {
				if l.pos > l.start {
					l.emit(TokenHTML)
				}

				l.pos += 2 // pass '{#'
				l.col += 2

				for {
					switch l.peek() {
					case EOF:
						l.errorf("Single-line comment not closed.")
						return
					case '\n':
						l.errorf("Newline not permitted in a single-line comment.")
						return
					}

					if strings.HasPrefix(l.input[l.pos:], "#}") {
						l.pos += 2 // pass '#}'
						l.col += 2
						break
					}

					l.next()
				}
				l.ignore() // ignore whole comment

				// Comment skipped
				continue // next token
			}

			if strings.HasPrefix(l.input[l.pos:], "{{") || // variable
				strings.HasPrefix(l.input[l.pos:], "{%") { // tag
				if l.pos > l.start {
					l.emit(TokenHTML)
				}
				l.tokenize()
				if l.errored {
					return
				}
				continue
			}
		}

		switch l.peek() {
		case '\n':
			l.line++
			l.col = 0
		}
		if l.next() == EOF {
			break
		}
	}

	if l.pos > l.start {
		l.emit(TokenHTML)
	}

	if l.in_verbatim {
		l.errorf("verbatim-tag not closed, got EOF.")
	}
}

func (l *lexer) tokenize() {
	for state := l.stateCode; state != nil; {
		state = state()
	}
}
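
// stateCode dispatches on the next character: whitespace is skipped,
// letters start an identifier, digits a number, '"' a string, and anything
// else must match one of TokenSymbols; "%}" or "}}" ends the tag/variable.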
func (l *lexer) stateCode() lexerStateFn {
outer_loop:
	for {
		switch {
		case l.accept(tokenSpaceChars):
			if l.value() == "\n" {
				return l.errorf("Newline not allowed within tag/variable.")
			}
			l.ignore()
			continue
		case l.accept(tokenIdentifierChars):
			return l.stateIdentifier
		case l.accept(tokenDigits):
			return l.stateNumber
		case l.accept(`"`):
			return l.stateString
		}

		// Check for symbol
		for _, sym := range TokenSymbols {
			if strings.HasPrefix(l.input[l.start:], sym) {
				l.pos += len(sym)
				l.col += l.length()
				l.emit(TokenSymbol)

				if sym == "%}" || sym == "}}" {
					// Tag/variable end, return after emit
					return nil
				}

				continue outer_loop
			}
		}

		if l.pos < len(l.input) {
			return l.errorf("Unknown character: %q (%d)", l.peek(), l.peek())
		}

		break
	}

	// Normal shut down
	return nil
}

func (l *lexer) stateIdentifier() lexerStateFn {
	l.acceptRun(tokenIdentifierChars)
	l.acceptRun(tokenIdentifierCharsWithDigits)
	for _, kw := range TokenKeywords {
		if kw == l.value() {
			l.emit(TokenKeyword)
			return l.stateCode
		}
	}
	l.emit(TokenIdentifier)
	return l.stateCode
}

func (l *lexer) stateNumber() lexerStateFn {
	l.acceptRun(tokenDigits)
	/*
		Maybe context-sensitive number lexing?
		* comments.0.Text   // first comment
		* usercomments.1.0  // second user, first comment
		* if (score >= 8.5) // 8.5 as a number

		if l.peek() == '.' {
			l.accept(".")
			if !l.accept(tokenDigits) {
				return l.errorf("Malformed number.")
			}
			l.acceptRun(tokenDigits)
		}
	*/
	l.emit(TokenNumber)
	return l.stateCode
}
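
// stateString is entered after the opening quote has already been consumed;
// it scans until the closing quote and allows only the \" and \\ escapes.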
func (l *lexer) stateString() lexerStateFn {
	l.ignore()
	l.startcol-- // we're starting the position at the first "
	for !l.accept(`"`) {
		switch l.next() {
		case '\\':
			// escape sequence
			switch l.peek() {
			case '"', '\\':
				l.next()
			default:
				return l.errorf("Unknown escape sequence: \\%c", l.peek())
			}
		case EOF:
			return l.errorf("Unexpected EOF, string not closed.")
		case '\n':
			return l.errorf("Newline in string is not allowed.")
		}
	}
	l.backup()
	l.emit(TokenString)

	l.next()
	l.ignore()

	return l.stateCode
}