parser.go

package mark

import (
	"regexp"
	"strings"
	"unicode"
	"unicode/utf8"
)

// parse holds the state of the parser.
type parse struct {
	Nodes     []Node
	lex       Lexer
	options   *Options
	tr        *parse // parent parser; nil for the root parser
	output    string
	peekCount int
	token     [3]item // three-token lookahead for the parser
	links     map[string]*DefLinkNode // DefLink parsing, used by RefLinks
	renderFn  map[NodeType]RenderFn // custom overridden render functions
}

// newParse returns a new parser for the given input and options.
func newParse(input string, opts *Options) *parse {
	return &parse{
		lex:      lex(input),
		options:  opts,
		links:    make(map[string]*DefLinkNode),
		renderFn: make(map[NodeType]RenderFn),
	}
}

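// A minimal usage sketch of this internal API (assuming the zero value of
// Options is acceptable; the exported entry points live elsewhere in the package):
//
//	p := newParse("# Hello", &Options{})
//	p.parse()
//	p.render()
//	html := p.output
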
// parse converts the raw text into a list of Nodes.
func (p *parse) parse() {
Loop:
	for {
		var n Node
		switch t := p.peek(); t.typ {
		case itemEOF, itemError:
			break Loop
		case itemNewLine:
			p.next()
		case itemHr:
			n = p.newHr(p.next().pos)
		case itemHTML:
			t = p.next()
			n = p.newHTML(t.pos, t.val)
		case itemDefLink:
			n = p.parseDefLink()
		case itemHeading, itemLHeading:
			n = p.parseHeading()
		case itemCodeBlock, itemGfmCodeBlock:
			n = p.parseCodeBlock()
		case itemList:
			n = p.parseList()
		case itemTable, itemLpTable:
			n = p.parseTable()
		case itemBlockQuote:
			n = p.parseBlockQuote()
		case itemIndent:
			space := p.next()
			// If the indent isn't followed by itemText, skip it.
			if p.peek().typ != itemText {
				continue
			}
			p.backup2(space)
			fallthrough
		// itemText
		default:
			tmp := p.newParagraph(t.pos)
			tmp.Nodes = p.parseText(p.next().val + p.scanLines())
			n = tmp
		}
		if n != nil {
			p.append(n)
		}
	}
}

// root returns the root parser (the top of the tr chain).
func (p *parse) root() *parse {
	if p.tr == nil {
		return p
	}
	return p.tr.root()
}

// render renders the parsed nodes into the desired output.
func (p *parse) render() {
	var output string
	for i, node := range p.Nodes {
		// If there's a custom render function, use it instead.
		if fn, ok := p.renderFn[node.Type()]; ok {
			output = fn(node)
		} else {
			output = node.Render()
		}
		p.output += output
		if output != "" && i != len(p.Nodes)-1 {
			p.output += "\n"
		}
	}
}

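// For example, a caller could override how a given node type is rendered, e.g.
// (NodeHeading is assumed to be a NodeType constant defined elsewhere in the package):
//
//	p.renderFn[NodeHeading] = func(n Node) string {
//		return "<!-- headings are dropped -->"
//	}
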
// append appends a new node to the node list.
func (p *parse) append(n Node) {
	p.Nodes = append(p.Nodes, n)
}

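// Note on the token buffer: p.token is a small array of up to three lookahead
// items. next consumes from the buffer when peekCount > 0 and from the lexer
// otherwise; peek fills one slot without consuming it; backup and backup2 push
// previously read tokens back so they will be returned again.
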
// next returns the next token.
func (p *parse) next() item {
	if p.peekCount > 0 {
		p.peekCount--
	} else {
		p.token[0] = p.lex.nextItem()
	}
	return p.token[p.peekCount]
}

// peek returns but does not consume the next token.
func (p *parse) peek() item {
	if p.peekCount > 0 {
		return p.token[p.peekCount-1]
	}
	p.peekCount = 1
	p.token[0] = p.lex.nextItem()
	return p.token[0]
}

// backup backs the input stream up one token.
func (p *parse) backup() {
	p.peekCount++
}

// backup2 backs the input stream up two tokens.
// The zeroth token is already there.
func (p *parse) backup2(t1 item) {
	p.token[1] = t1
	p.peekCount = 2
}

// parseText parses inline text and returns the resulting nodes.
func (p *parse) parseText(input string) (nodes []Node) {
	// Trim whitespace that is not a hard line break.
	input = regexp.MustCompile(`(?m)^ +| +(\n|$)`).ReplaceAllStringFunc(input, func(s string) string {
		if reBr.MatchString(s) {
			return s
		}
		return strings.Replace(s, " ", "", -1)
	})
	l := lexInline(input)
	for token := range l.items {
		var node Node
		switch token.typ {
		case itemBr:
			node = p.newBr(token.pos)
		case itemStrong, itemItalic, itemStrike, itemCode:
			node = p.parseEmphasis(token.typ, token.pos, token.val)
		case itemLink, itemAutoLink, itemGfmLink:
			var title, href string
			var text []Node
			if token.typ == itemLink {
				match := reLink.FindStringSubmatch(token.val)
				text = p.parseText(match[1])
				href, title = match[2], match[3]
			} else {
				var match []string
				if token.typ == itemGfmLink {
					match = reGfmLink.FindStringSubmatch(token.val)
				} else {
					match = reAutoLink.FindStringSubmatch(token.val)
				}
				href = match[1]
				text = append(text, p.newText(token.pos, match[1]))
			}
			node = p.newLink(token.pos, title, href, text...)
		case itemImage:
			match := reImage.FindStringSubmatch(token.val)
			node = p.newImage(token.pos, match[3], match[2], match[1])
		case itemRefLink, itemRefImage:
			match := reRefLink.FindStringSubmatch(token.val)
			text, ref := match[1], match[2]
			if ref == "" {
				ref = text
			}
			if token.typ == itemRefLink {
				node = p.newRefLink(token.typ, token.pos, token.val, ref, p.parseText(text))
			} else {
				node = p.newRefImage(token.typ, token.pos, token.val, ref, text)
			}
		case itemHTML:
			node = p.newHTML(token.pos, token.val)
		default:
			node = p.newText(token.pos, token.val)
		}
		nodes = append(nodes, node)
	}
	return nodes
}

// parseEmphasis parses an inline emphasis token (strong, italic, strike or code).
func (p *parse) parseEmphasis(typ itemType, pos Pos, val string) *EmphasisNode {
	var re *regexp.Regexp
	switch typ {
	case itemStrike:
		re = reStrike
	case itemStrong:
		re = reStrong
	case itemCode:
		re = reCode
	case itemItalic:
		re = reItalic
	}
	node := p.newEmphasis(pos, typ)
	match := re.FindStringSubmatch(val)
	text := match[len(match)-1]
	if text == "" {
		text = match[1]
	}
	node.Nodes = p.parseText(text)
	return node
}

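// For instance, an itemStrong token value is typically of the form "**text**" or
// "__text__"; the matching regexp strips the delimiters, and the inner text is
// handed back to parseText, so nested emphasis is handled by the recursive call.
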
// parseHeading parses a heading block.
func (p *parse) parseHeading() (node *HeadingNode) {
	token := p.next()
	level := 1
	var text string
	if token.typ == itemHeading {
		match := reHeading.FindStringSubmatch(token.val)
		level, text = len(match[1]), match[2]
	} else {
		// Setext headings: underlined with equal signs for first-level
		// and dashes for second-level.
		match := reLHeading.FindStringSubmatch(token.val)
		text = match[1]
		if match[2] == "-" {
			level = 2
		}
	}
	node = p.newHeading(token.pos, level, text)
	node.Nodes = p.parseText(text)
	return
}

// parseDefLink parses a link definition and registers it on the root parser.
func (p *parse) parseDefLink() *DefLinkNode {
	token := p.next()
	match := reDefLink.FindStringSubmatch(token.val)
	name := strings.ToLower(match[1])
	// name (lowercased), href, title
	n := p.newDefLink(token.pos, name, match[2], match[3])
	// Store in the root's links map; the first definition of a name wins.
	links := p.root().links
	if _, ok := links[name]; !ok {
		links[name] = n
	}
	return n
}

// parseCodeBlock parses an indented or GFM fenced code block.
func (p *parse) parseCodeBlock() *CodeNode {
	var lang, text string
	token := p.next()
	if token.typ == itemGfmCodeBlock {
		codeStart := reGfmCode.FindStringSubmatch(token.val)
		lang = codeStart[3]
		text = token.val[len(codeStart[0]):]
	} else {
		text = reCodeBlock.trim(token.val, "")
	}
	return p.newCode(token.pos, lang, text)
}

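// In the GFM case the token value looks roughly like:
//
//	```go
//	fmt.Println("hi")
//	```
//
// where the opening fence (matched by reGfmCode) carries the optional language
// tag, and everything after it is kept as the literal code text.
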
// parseBlockQuote parses a blockquote block.
func (p *parse) parseBlockQuote() (n *BlockQuoteNode) {
	token := p.next()
	// Strip the leading "> " markers from every line.
	re := regexp.MustCompile(`(?m)^ *> ?`)
	raw := re.ReplaceAllString(token.val, "")
	// TODO(a8m): doesn't work right now with defLink (inside the blockQuote)
	tr := &parse{lex: lex(raw), tr: p}
	tr.parse()
	n = p.newBlockQuote(token.pos)
	n.Nodes = tr.Nodes
	return
}

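// Blockquote and list-item bodies are handled by a child parser whose tr field
// points back at the current parser, so root() still resolves to the top-level
// parser while the child accumulates its own node list.
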
// parseList parses a list block; a leading digit marks it as ordered.
func (p *parse) parseList() *ListNode {
	token := p.next()
	list := p.newList(token.pos, isDigit(token.val))
Loop:
	for {
		switch token = p.peek(); token.typ {
		case itemLooseItem, itemListItem:
			list.append(p.parseListItem())
		default:
			break Loop
		}
	}
	return list
}

// parseListItem parses a single list item.
func (p *parse) parseListItem() *ListItemNode {
	token := p.next()
	item := p.newListItem(token.pos)
	token.val = strings.TrimSpace(token.val)
	if p.isTaskItem(token.val) {
		item.Nodes = p.parseTaskItem(token)
		return item
	}
	tr := &parse{lex: lex(token.val), tr: p}
	tr.parse()
	for _, node := range tr.Nodes {
		// Wrap with a paragraph only when it's a loose item;
		// tight items (itemListItem) have their paragraphs unwrapped.
		if n, ok := node.(*ParagraphNode); ok && token.typ == itemListItem {
			item.Nodes = append(item.Nodes, n.Nodes...)
		} else {
			item.append(node)
		}
	}
	return item
}

// parseTaskItem parses a list item as a task item.
func (p *parse) parseTaskItem(token item) []Node {
	// token.val starts with "[x] " or "[ ] "; index 1 holds the checkbox state.
	checkbox := p.newCheckbox(token.pos, token.val[1] == 'x')
	token.val = strings.TrimSpace(token.val[3:])
	return append([]Node{checkbox}, p.parseText(token.val)...)
}

// isTaskItem reports whether the given string is a list task item.
func (p *parse) isTaskItem(s string) bool {
	if len(s) < 5 || s[0] != '[' || (s[1] != 'x' && s[1] != ' ') || s[2] != ']' {
		return false
	}
	return "" != strings.TrimSpace(s[3:])
}

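// For example, "[x] ship it" and "[ ] todo" are task items, while "[y] nope"
// and a bare "[x]" (no text after the checkbox) are not.
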
// parseTable parses a table block.
func (p *parse) parseTable() *TableNode {
	table := p.newTable(p.next().pos)
	// Align  [ None, Left, Right, ... ]
	// Header [ Cells: [ ... ] ]
	// Data:  [ Rows: [ Cells: [ ... ] ] ]
	rows := struct {
		Align  []AlignType
		Header []item
		Cells  [][]item
	}{}
Loop:
	for i := 0; ; {
		switch token := p.next(); token.typ {
		case itemTableRow:
			i++
			if i > 2 {
				rows.Cells = append(rows.Cells, []item{})
			}
		case itemTableCell:
			// Header
			if i == 1 {
				rows.Header = append(rows.Header, token)
				// Alignment
			} else if i == 2 {
				rows.Align = append(rows.Align, parseAlign(token.val))
				// Data
			} else {
				pos := i - 3
				rows.Cells[pos] = append(rows.Cells[pos], token)
			}
		default:
			p.backup()
			break Loop
		}
	}
	// Transform to nodes
	table.append(p.parseCells(Header, rows.Header, rows.Align))
	// Table body
	for _, row := range rows.Cells {
		table.append(p.parseCells(Data, row, rows.Align))
	}
	return table
}

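// The row counter above maps directly onto the table layout: i == 1 is the
// header row, i == 2 is the alignment/separator row (":---", "---:", ...),
// and every row after that (i > 2) is a data row stored at index i-3.
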
// parseCells parses the given cells and returns a new table row.
func (p *parse) parseCells(kind int, items []item, align []AlignType) *RowNode {
	var row *RowNode
	for i, item := range items {
		if i == 0 {
			row = p.newRow(item.pos)
		}
		cell := p.newCell(item.pos, kind, align[i])
		cell.Nodes = p.parseText(item.val)
		row.append(cell)
	}
	return row
}

// scanLines consumes consecutive text lines (itemText) that belong to the same paragraph.
func (p *parse) scanLines() (s string) {
	for {
		tkn := p.next()
		if tkn.typ == itemText || tkn.typ == itemIndent {
			s += tkn.val
		} else if tkn.typ == itemNewLine {
			if t := p.peek().typ; t != itemText && t != itemIndent {
				p.backup2(tkn)
				break
			}
			s += tkn.val
		} else {
			p.backup()
			break
		}
	}
	return
}

// parseAlign takes an alignment string and returns its AlignType.
func parseAlign(s string) (typ AlignType) {
	sfx, pfx := strings.HasSuffix(s, ":"), strings.HasPrefix(s, ":")
	switch {
	case sfx && pfx:
		typ = Center
	case sfx:
		typ = Right
	case pfx:
		typ = Left
	}
	return
}

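// For example, ":---:" yields Center, "---:" yields Right, ":---" yields Left,
// and a plain "---" falls through to the zero AlignType (no explicit alignment).
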
// isDigit reports whether the given string starts with a digit.
func isDigit(s string) bool {
	r, _ := utf8.DecodeRuneInString(s)
	return unicode.IsDigit(r)
}