// lexical.go (12 KB)
  1. package xsql
  2. import (
  3. "bufio"
  4. "bytes"
  5. "io"
  6. "strconv"
  7. "strings"
  8. )
  // Token identifies a lexical token kind produced by the scanner.
  type Token int

  // The order of these constants is significant: tokens is indexed by them,
  // isOperator relies on the operatorBeg/operatorEnd markers and
  // isTimeLiteral relies on DD..MS being contiguous.
  const (
  	// Special tokens
  	ILLEGAL Token = iota
  	EOF
  	WS
  	COMMENT
  	AS

  	// Literals
  	IDENT     // main
  	INTEGER   // 12345
  	NUMBER    // 12345.67
  	STRING    // "abc"
  	BADSTRING // "abc

  	operatorBeg
  	// ADD and the following are InfluxQL Operators
  	ADD         // +
  	SUB         // -
  	MUL         // *
  	DIV         // /
  	MOD         // %
  	BITWISE_AND // &
  	BITWISE_OR  // |
  	BITWISE_XOR // ^

  	AND // AND
  	OR  // OR

  	EQ  // =
  	NEQ // !=
  	LT  // <
  	LTE // <=
  	GT  // >
  	GTE // >=

  	SUBSET // [
  	ARROW  // ->
  	operatorEnd

  	// Misc characters
  	ASTERISK  // *
  	COMMA     // ,
  	LPAREN    // (
  	RPAREN    // )
  	LBRACKET  // [
  	RBRACKET  // ]
  	HASH      // #
  	DOT       // .
  	COLON     // :
  	SEMICOLON // ;
  	COLSEP    // \007

  	// Keywords
  	SELECT
  	FROM
  	JOIN
  	INNER
  	LEFT
  	RIGHT
  	FULL
  	CROSS
  	ON
  	WHERE
  	GROUP
  	ORDER
  	HAVING
  	BY
  	ASC
  	DESC
  	FILTER
  	TRUE
  	FALSE

  	CREATE
  	DROP
  	EXPLAIN
  	DESCRIBE
  	SHOW
  	STREAM
  	STREAMS
  	WITH

  	XBIGINT
  	XFLOAT
  	XSTRING
  	XBYTEA
  	XDATETIME
  	XBOOLEAN
  	XARRAY
  	XSTRUCT

  	DATASOURCE
  	KEY
  	FORMAT
  	CONF_KEY
  	TYPE
  	STRICT_VALIDATION
  	TIMESTAMP
  	TIMESTAMP_FORMAT

  	DD
  	HH
  	MI
  	SS
  	MS
  )

  // tokens maps each Token to its printable form. Marker constants
  // (operatorBeg, operatorEnd) intentionally have no entry and print as "".
  var tokens = []string{
  	ILLEGAL:   "ILLEGAL",
  	EOF:       "EOF",
  	WS:        "WS",
  	COMMENT:   "COMMENT",
  	AS:        "AS",
  	IDENT:     "IDENT",
  	INTEGER:   "INTEGER",
  	NUMBER:    "NUMBER",
  	STRING:    "STRING",
  	BADSTRING: "BADSTRING",

  	ADD:         "+",
  	SUB:         "-",
  	MUL:         "*",
  	DIV:         "/",
  	MOD:         "%",
  	BITWISE_AND: "&",
  	BITWISE_OR:  "|",
  	BITWISE_XOR: "^",

  	AND: "AND",
  	OR:  "OR",

  	EQ:  "=",
  	NEQ: "!=",
  	LT:  "<",
  	LTE: "<=",
  	GT:  ">",
  	GTE: ">=",

  	SUBSET: "[]",
  	ARROW:  "->",

  	ASTERISK:  "*",
  	COMMA:     ",",
  	LPAREN:    "(",
  	RPAREN:    ")",
  	LBRACKET:  "[",
  	RBRACKET:  "]",
  	HASH:      "#",
  	DOT:       ".",
  	SEMICOLON: ";",
  	COLON:     ":",
  	COLSEP:    "\007",

  	SELECT: "SELECT",
  	FROM:   "FROM",
  	JOIN:   "JOIN",
  	INNER:  "INNER",
  	LEFT:   "LEFT",
  	RIGHT:  "RIGHT",
  	FULL:   "FULL",
  	CROSS:  "CROSS",
  	ON:     "ON",
  	WHERE:  "WHERE",
  	GROUP:  "GROUP",
  	ORDER:  "ORDER",
  	HAVING: "HAVING",
  	BY:     "BY",
  	ASC:    "ASC",
  	DESC:   "DESC",
  	FILTER: "FILTER",
  	TRUE:   "TRUE",
  	FALSE:  "FALSE",

  	CREATE:   "CREATE",
  	DROP:     "DROP", // was misspelled "RROP"
  	EXPLAIN:  "EXPLAIN",
  	DESCRIBE: "DESCRIBE",
  	SHOW:     "SHOW",
  	STREAM:   "STREAM",
  	STREAMS:  "STREAMS",
  	WITH:     "WITH",

  	XBIGINT:   "BIGINT",
  	XFLOAT:    "FLOAT",
  	XSTRING:   "STRING",
  	XBYTEA:    "BYTEA",
  	XDATETIME: "DATETIME",
  	XBOOLEAN:  "BOOLEAN",
  	XARRAY:    "ARRAY",
  	XSTRUCT:   "STRUCT",

  	DATASOURCE:        "DATASOURCE",
  	KEY:               "KEY",
  	FORMAT:            "FORMAT",
  	CONF_KEY:          "CONF_KEY",
  	TYPE:              "TYPE",
  	STRICT_VALIDATION: "STRICT_VALIDATION",
  	TIMESTAMP:         "TIMESTAMP",
  	TIMESTAMP_FORMAT:  "TIMESTAMP_FORMAT",

  	DD: "DD",
  	HH: "HH",
  	MI: "MI",
  	SS: "SS",
  	MS: "MS",
  }

  // String returns the printable form of tok from the tokens table, or ""
  // when tok is outside the table or has no entry.
  func (tok Token) String() string {
  	if tok >= 0 && tok < Token(len(tokens)) {
  		return tokens[tok]
  	}
  	return ""
  }
  // Scanner reads runes from a buffered input and turns them into tokens.
  type Scanner struct {
  	r *bufio.Reader // buffered rune source
  }

  // NewScanner returns a Scanner that reads from r through a bufio.Reader.
  func NewScanner(r io.Reader) *Scanner {
  	sc := new(Scanner)
  	sc.r = bufio.NewReader(r)
  	return sc
  }
  201. func (s *Scanner) Scan() (tok Token, lit string) {
  202. ch := s.read()
  203. if isWhiteSpace(ch) {
  204. //s.unread()
  205. return s.ScanWhiteSpace()
  206. } else if isLetter(ch) {
  207. s.unread()
  208. return s.ScanIdent()
  209. } else if isQuotation(ch) {
  210. s.unread()
  211. return s.ScanString()
  212. } else if isDigit(ch) {
  213. s.unread()
  214. return s.ScanNumber(false, false)
  215. } else if isBackquote(ch) {
  216. return s.ScanBackquoteIdent()
  217. }
  218. switch ch {
  219. case eof:
  220. return EOF, tokens[EOF]
  221. case '=':
  222. return EQ, tokens[EQ]
  223. case '!':
  224. _, _ = s.ScanWhiteSpace()
  225. if r := s.read(); r == '=' {
  226. return NEQ, tokens[NEQ]
  227. } else {
  228. s.unread()
  229. }
  230. return EQ, tokens[EQ]
  231. case '<':
  232. _, _ = s.ScanWhiteSpace()
  233. if r := s.read(); r == '=' {
  234. return LTE, tokens[LTE]
  235. } else {
  236. s.unread()
  237. }
  238. return LT, tokens[LT]
  239. case '>':
  240. _, _ = s.ScanWhiteSpace()
  241. if r := s.read(); r == '=' {
  242. return GTE, tokens[GTE]
  243. } else {
  244. s.unread()
  245. }
  246. return GT, tokens[GT]
  247. case '+':
  248. return ADD, tokens[ADD]
  249. case '-':
  250. _, _ = s.ScanWhiteSpace()
  251. if r := s.read(); r == '-' {
  252. s.skipUntilNewline()
  253. return COMMENT, ""
  254. } else if r == '>' {
  255. return ARROW, tokens[ARROW]
  256. } else if isDigit(r) {
  257. s.unread()
  258. return s.ScanNumber(false, true)
  259. } else if r == '.' {
  260. _, _ = s.ScanWhiteSpace()
  261. if r1 := s.read(); isDigit(r1) {
  262. s.unread()
  263. return s.ScanNumber(true, true)
  264. } else {
  265. s.unread()
  266. }
  267. s.unread()
  268. } else {
  269. s.unread()
  270. }
  271. return SUB, tokens[SUB]
  272. case '/':
  273. _, _ = s.ScanWhiteSpace()
  274. if r := s.read(); r == '*' {
  275. if err := s.skipUntilEndComment(); err != nil {
  276. return ILLEGAL, ""
  277. }
  278. return COMMENT, ""
  279. } else {
  280. s.unread()
  281. }
  282. return DIV, tokens[DIV]
  283. case '.':
  284. if r := s.read(); isDigit(r) {
  285. s.unread()
  286. return s.ScanNumber(true, false)
  287. }
  288. s.unread()
  289. return DOT, tokens[DOT]
  290. case '%':
  291. return MOD, tokens[MOD]
  292. case '&':
  293. return BITWISE_AND, tokens[BITWISE_AND]
  294. case '|':
  295. return BITWISE_OR, tokens[BITWISE_OR]
  296. case '^':
  297. return BITWISE_XOR, tokens[BITWISE_XOR]
  298. case '*':
  299. return ASTERISK, tokens[ASTERISK]
  300. case ',':
  301. return COMMA, tokens[COMMA]
  302. case '(':
  303. return LPAREN, tokens[LPAREN]
  304. case ')':
  305. return RPAREN, tokens[RPAREN]
  306. case '[':
  307. return LBRACKET, tokens[LBRACKET]
  308. case ']':
  309. return RBRACKET, tokens[RBRACKET]
  310. case ':':
  311. return COLON, tokens[COLON]
  312. case '#':
  313. return HASH, tokens[HASH]
  314. case ';':
  315. return SEMICOLON, tokens[SEMICOLON]
  316. }
  317. return ILLEGAL, ""
  318. }
  319. func (s *Scanner) ScanIdent() (tok Token, lit string) {
  320. var buf bytes.Buffer
  321. buf.WriteRune(s.read())
  322. for {
  323. if ch := s.read(); ch == eof {
  324. break
  325. } else if !isLetter(ch) && !isDigit(ch) && ch != '_' {
  326. s.unread()
  327. break
  328. } else {
  329. buf.WriteRune(ch)
  330. }
  331. }
  332. switch lit = strings.ToUpper(buf.String()); lit {
  333. case "SELECT":
  334. return SELECT, lit
  335. case "AS":
  336. return AS, lit
  337. case "FROM":
  338. return FROM, lit
  339. case "WHERE":
  340. return WHERE, lit
  341. case "AND":
  342. return AND, lit
  343. case "OR":
  344. return OR, lit
  345. case "GROUP":
  346. return GROUP, lit
  347. case "HAVING":
  348. return HAVING, lit
  349. case "ORDER":
  350. return ORDER, lit
  351. case "BY":
  352. return BY, lit
  353. case "DESC":
  354. return DESC, lit
  355. case "ASC":
  356. return ASC, lit
  357. case "FILTER":
  358. return FILTER, lit
  359. case "INNER":
  360. return INNER, lit
  361. case "LEFT":
  362. return LEFT, lit
  363. case "RIGHT":
  364. return RIGHT, lit
  365. case "FULL":
  366. return FULL, lit
  367. case "CROSS":
  368. return CROSS, lit
  369. case "JOIN":
  370. return JOIN, lit
  371. case "ON":
  372. return ON, lit
  373. case "CREATE":
  374. return CREATE, lit
  375. case "DROP":
  376. return DROP, lit
  377. case "EXPLAIN":
  378. return EXPLAIN, lit
  379. case "DESCRIBE":
  380. return DESCRIBE, lit
  381. case "SHOW":
  382. return SHOW, lit
  383. case "STREAM":
  384. return STREAM, lit
  385. case "STREAMS":
  386. return STREAMS, lit
  387. case "WITH":
  388. return WITH, lit
  389. case "BIGINT":
  390. return XBIGINT, lit
  391. case "FLOAT":
  392. return XFLOAT, lit
  393. case "DATETIME":
  394. return XDATETIME, lit
  395. case "STRING":
  396. return XSTRING, lit
  397. case "BYTEA":
  398. return XBYTEA, lit
  399. case "BOOLEAN":
  400. return XBOOLEAN, lit
  401. case "ARRAY":
  402. return XARRAY, lit
  403. case "STRUCT":
  404. return XSTRUCT, lit
  405. case "DATASOURCE":
  406. return DATASOURCE, lit
  407. case "KEY":
  408. return KEY, lit
  409. case "FORMAT":
  410. return FORMAT, lit
  411. case "CONF_KEY":
  412. return CONF_KEY, lit
  413. case "TYPE":
  414. return TYPE, lit
  415. case "TRUE":
  416. return TRUE, lit
  417. case "FALSE":
  418. return FALSE, lit
  419. case "STRICT_VALIDATION":
  420. return STRICT_VALIDATION, lit
  421. case "TIMESTAMP":
  422. return TIMESTAMP, lit
  423. case "TIMESTAMP_FORMAT":
  424. return TIMESTAMP_FORMAT, lit
  425. case "DD":
  426. return DD, lit
  427. case "HH":
  428. return HH, lit
  429. case "MI":
  430. return MI, lit
  431. case "SS":
  432. return SS, lit
  433. case "MS":
  434. return MS, lit
  435. }
  436. return IDENT, buf.String()
  437. }
  438. func (s *Scanner) ScanString() (tok Token, lit string) {
  439. var buf bytes.Buffer
  440. ch := s.read()
  441. buf.WriteRune(ch)
  442. escape := false
  443. for {
  444. ch = s.read()
  445. if ch == '"' && !escape {
  446. buf.WriteRune(ch)
  447. break
  448. } else if ch == eof {
  449. return BADSTRING, buf.String()
  450. } else if ch == '\\' && !escape {
  451. escape = true
  452. buf.WriteRune(ch)
  453. } else {
  454. escape = false
  455. buf.WriteRune(ch)
  456. }
  457. }
  458. r, _ := strconv.Unquote(buf.String())
  459. return STRING, r
  460. }
  461. func (s *Scanner) ScanDigit() (tok Token, lit string) {
  462. var buf bytes.Buffer
  463. ch := s.read()
  464. buf.WriteRune(ch)
  465. for {
  466. if ch := s.read(); isDigit(ch) {
  467. buf.WriteRune(ch)
  468. } else {
  469. s.unread()
  470. break
  471. }
  472. }
  473. return INTEGER, buf.String()
  474. }
  475. func (s *Scanner) ScanNumber(startWithDot bool, isNeg bool) (tok Token, lit string) {
  476. var buf bytes.Buffer
  477. if isNeg {
  478. buf.WriteRune('-')
  479. }
  480. if startWithDot {
  481. buf.WriteRune('.')
  482. }
  483. ch := s.read()
  484. buf.WriteRune(ch)
  485. isNum := false
  486. for {
  487. if ch := s.read(); isDigit(ch) {
  488. buf.WriteRune(ch)
  489. } else if ch == '.' {
  490. isNum = true
  491. buf.WriteRune(ch)
  492. } else {
  493. s.unread()
  494. break
  495. }
  496. }
  497. if isNum || startWithDot {
  498. return NUMBER, buf.String()
  499. } else {
  500. return INTEGER, buf.String()
  501. }
  502. }
  503. func (s *Scanner) ScanBackquoteIdent() (tok Token, lit string) {
  504. var buf bytes.Buffer
  505. for {
  506. ch := s.read()
  507. if isBackquote(ch) || ch == eof {
  508. break
  509. } else {
  510. buf.WriteRune(ch)
  511. }
  512. }
  513. return IDENT, buf.String()
  514. }
  515. func (s *Scanner) skipUntilNewline() {
  516. for {
  517. if ch := s.read(); ch == '\n' || ch == eof {
  518. return
  519. }
  520. }
  521. }
  522. func (s *Scanner) skipUntilEndComment() error {
  523. for {
  524. if ch1 := s.read(); ch1 == '*' {
  525. // We might be at the end.
  526. star:
  527. ch2 := s.read()
  528. if ch2 == '/' {
  529. return nil
  530. } else if ch2 == '*' {
  531. // We are back in the state machine since we see a star.
  532. goto star
  533. } else if ch2 == eof {
  534. return io.EOF
  535. }
  536. } else if ch1 == eof {
  537. return io.EOF
  538. }
  539. }
  540. }
  541. func (s *Scanner) ScanWhiteSpace() (tok Token, lit string) {
  542. var buf bytes.Buffer
  543. for {
  544. if ch := s.read(); ch == eof {
  545. break
  546. } else if !isWhiteSpace(ch) {
  547. s.unread()
  548. break
  549. } else {
  550. buf.WriteRune(ch)
  551. }
  552. }
  553. return WS, buf.String()
  554. }
  555. func (s *Scanner) read() rune {
  556. ch, _, err := s.r.ReadRune()
  557. if err != nil {
  558. return eof
  559. }
  560. return ch
  561. }
  562. func (s *Scanner) unread() {
  563. _ = s.r.UnreadRune()
  564. }
  // eof is the rune (value 0) that read returns when no rune can be read.
  var eof rune = 0
  // isWhiteSpace reports whether r is a space, tab, carriage return or
  // newline.
  func isWhiteSpace(r rune) bool {
  	switch r {
  	case ' ', '\t', '\r', '\n':
  		return true
  	}
  	return false
  }
  // isLetter reports whether ch is an ASCII letter (a-z or A-Z).
  func isLetter(ch rune) bool {
  	switch {
  	case 'a' <= ch && ch <= 'z':
  		return true
  	case 'A' <= ch && ch <= 'Z':
  		return true
  	}
  	return false
  }
  // isDigit reports whether ch is an ASCII decimal digit (0-9).
  func isDigit(ch rune) bool {
  	if ch < '0' || ch > '9' {
  		return false
  	}
  	return true
  }
  // isQuotation reports whether ch is the double-quote character.
  func isQuotation(ch rune) bool {
  	const doubleQuote = '"'
  	return doubleQuote == ch
  }
  // isBackquote reports whether ch is the backquote character.
  func isBackquote(ch rune) bool {
  	const backquote = '`'
  	return backquote == ch
  }
  573. func (tok Token) isOperator() bool {
  574. return (tok > operatorBeg && tok < operatorEnd) || tok == ASTERISK || tok == LBRACKET
  575. }
  576. func (tok Token) isTimeLiteral() bool { return tok >= DD && tok <= MS }
  577. func (tok Token) allowedSourceToken() bool {
  578. return tok == IDENT || tok == DIV || tok == HASH || tok == ADD
  579. }
  580. //Allowed special field name token
  581. func (tok Token) allowedSFNToken() bool { return tok == DOT }
  582. func (tok Token) Precedence() int {
  583. switch tok {
  584. case OR:
  585. return 1
  586. case AND:
  587. return 2
  588. case EQ, NEQ, LT, LTE, GT, GTE:
  589. return 3
  590. case ADD, SUB, BITWISE_OR, BITWISE_XOR:
  591. return 4
  592. case MUL, DIV, MOD, BITWISE_AND, SUBSET, ARROW:
  593. return 5
  594. }
  595. return 0
  596. }
  // DataType enumerates the data types known to the package.
  type DataType int

  // The order matters: dataTypes is indexed by these values and isSimpleType
  // treats BIGINT..BOOLEAN as a contiguous range.
  const (
  	UNKNOWN DataType = iota
  	BIGINT
  	FLOAT
  	STRINGS
  	BYTEA
  	DATETIME
  	BOOLEAN
  	ARRAY
  	STRUCT
  )

  // dataTypes maps each DataType to its lower-case name. UNKNOWN has no entry
  // and therefore prints as "".
  var dataTypes = []string{
  	BIGINT:   "bigint",
  	FLOAT:    "float",
  	STRINGS:  "string",
  	BYTEA:    "bytea", // was missing, so BYTEA printed as ""
  	DATETIME: "datetime",
  	BOOLEAN:  "boolean",
  	ARRAY:    "array",
  	STRUCT:   "struct",
  }

  // isSimpleType reports whether d lies in the range BIGINT through BOOLEAN,
  // i.e. is neither UNKNOWN, ARRAY nor STRUCT.
  func (d DataType) isSimpleType() bool {
  	return d >= BIGINT && d <= BOOLEAN
  }

  // String returns the lower-case name of d from dataTypes, or "" when d is
  // outside the table or has no entry.
  func (d DataType) String() string {
  	if d >= 0 && d < DataType(len(dataTypes)) {
  		return dataTypes[d]
  	}
  	return ""
  }
  627. func getDataType(tok Token) DataType {
  628. switch tok {
  629. case XBIGINT:
  630. return BIGINT
  631. case XFLOAT:
  632. return FLOAT
  633. case XSTRING:
  634. return STRINGS
  635. case XBYTEA:
  636. return BYTEA
  637. case XDATETIME:
  638. return DATETIME
  639. case XBOOLEAN:
  640. return BOOLEAN
  641. case XARRAY:
  642. return ARRAY
  643. case XSTRUCT:
  644. return STRUCT
  645. }
  646. return UNKNOWN
  647. }