docx2md.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560
  1. // https://github.com/mattn/docx2md
  2. // License MIT
  3. package utils
  4. import (
  5. "archive/zip"
  6. "bytes"
  7. "encoding/base64"
  8. "encoding/xml"
  9. "errors"
  10. _ "flag"
  11. "fmt"
  12. "io"
  13. "io/ioutil"
  14. "log"
  15. "os"
  16. "path"
  17. "path/filepath"
  18. _ "runtime"
  19. "strconv"
  20. "strings"
  21. "github.com/mattn/go-runewidth"
  22. )
  23. // Relationship is
  24. type Relationship struct {
  25. Text string `xml:",chardata"`
  26. ID string `xml:"Id,attr"`
  27. Type string `xml:"Type,attr"`
  28. Target string `xml:"Target,attr"`
  29. TargetMode string `xml:"TargetMode,attr"`
  30. }
  31. // Relationships is
  32. type Relationships struct {
  33. XMLName xml.Name `xml:"Relationships"`
  34. Text string `xml:",chardata"`
  35. Xmlns string `xml:"xmlns,attr"`
  36. Relationship []Relationship `xml:"Relationship"`
  37. }
  38. // TextVal is
  39. type TextVal struct {
  40. Text string `xml:",chardata"`
  41. Val string `xml:"val,attr"`
  42. }
  43. // NumberingLvl is
  44. type NumberingLvl struct {
  45. Text string `xml:",chardata"`
  46. Ilvl string `xml:"ilvl,attr"`
  47. Tplc string `xml:"tplc,attr"`
  48. Tentative string `xml:"tentative,attr"`
  49. Start TextVal `xml:"start"`
  50. NumFmt TextVal `xml:"numFmt"`
  51. LvlText TextVal `xml:"lvlText"`
  52. LvlJc TextVal `xml:"lvlJc"`
  53. PPr struct {
  54. Text string `xml:",chardata"`
  55. Ind struct {
  56. Text string `xml:",chardata"`
  57. Left string `xml:"left,attr"`
  58. Hanging string `xml:"hanging,attr"`
  59. } `xml:"ind"`
  60. } `xml:"pPr"`
  61. RPr struct {
  62. Text string `xml:",chardata"`
  63. U struct {
  64. Text string `xml:",chardata"`
  65. Val string `xml:"val,attr"`
  66. } `xml:"u"`
  67. RFonts struct {
  68. Text string `xml:",chardata"`
  69. Hint string `xml:"hint,attr"`
  70. } `xml:"rFonts"`
  71. } `xml:"rPr"`
  72. }
  73. // Numbering is
  74. type Numbering struct {
  75. XMLName xml.Name `xml:"numbering"`
  76. Text string `xml:",chardata"`
  77. Wpc string `xml:"wpc,attr"`
  78. Cx string `xml:"cx,attr"`
  79. Cx1 string `xml:"cx1,attr"`
  80. Mc string `xml:"mc,attr"`
  81. O string `xml:"o,attr"`
  82. R string `xml:"r,attr"`
  83. M string `xml:"m,attr"`
  84. V string `xml:"v,attr"`
  85. Wp14 string `xml:"wp14,attr"`
  86. Wp string `xml:"wp,attr"`
  87. W10 string `xml:"w10,attr"`
  88. W string `xml:"w,attr"`
  89. W14 string `xml:"w14,attr"`
  90. W15 string `xml:"w15,attr"`
  91. W16se string `xml:"w16se,attr"`
  92. Wpg string `xml:"wpg,attr"`
  93. Wpi string `xml:"wpi,attr"`
  94. Wne string `xml:"wne,attr"`
  95. Wps string `xml:"wps,attr"`
  96. Ignorable string `xml:"Ignorable,attr"`
  97. AbstractNum []struct {
  98. Text string `xml:",chardata"`
  99. AbstractNumID string `xml:"abstractNumId,attr"`
  100. RestartNumberingAfterBreak string `xml:"restartNumberingAfterBreak,attr"`
  101. Nsid TextVal `xml:"nsid"`
  102. MultiLevelType TextVal `xml:"multiLevelType"`
  103. Tmpl TextVal `xml:"tmpl"`
  104. Lvl []NumberingLvl `xml:"lvl"`
  105. } `xml:"abstractNum"`
  106. Num []struct {
  107. Text string `xml:",chardata"`
  108. NumID string `xml:"numId,attr"`
  109. AbstractNumID TextVal `xml:"abstractNumId"`
  110. } `xml:"num"`
  111. }
  112. type file struct {
  113. rels Relationships
  114. num Numbering
  115. r *zip.ReadCloser
  116. embed bool
  117. list map[string]int
  118. name string
  119. }
  120. // Node is
  121. type Node struct {
  122. XMLName xml.Name
  123. Attrs []xml.Attr `xml:"-"`
  124. Content []byte `xml:",innerxml"`
  125. Nodes []Node `xml:",any"`
  126. }
  127. // UnmarshalXML is
  128. func (n *Node) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  129. n.Attrs = start.Attr
  130. type node Node
  131. return d.DecodeElement((*node)(n), &start)
  132. }
  133. func escape(s, set string) string {
  134. replacer := []string{}
  135. for _, r := range []rune(set) {
  136. rs := string(r)
  137. replacer = append(replacer, rs, `\`+rs)
  138. }
  139. return strings.NewReplacer(replacer...).Replace(s)
  140. }
  141. func (zf *file) extract(rel *Relationship, w io.Writer) error {
  142. err := os.MkdirAll(filepath.Join("uploads", zf.name, filepath.Dir(rel.Target)), 0755)
  143. if err != nil {
  144. return err
  145. }
  146. for _, f := range zf.r.File {
  147. if f.Name != "word/"+rel.Target {
  148. continue
  149. }
  150. rc, err := f.Open()
  151. if err != nil {
  152. return err
  153. }
  154. defer rc.Close()
  155. b := make([]byte, f.UncompressedSize64)
  156. n, err := rc.Read(b)
  157. if err != nil && err != io.EOF {
  158. return err
  159. }
  160. if zf.embed {
  161. fmt.Fprintf(w, "![](data:image/png;base64,%s)",
  162. base64.StdEncoding.EncodeToString(b[:n]))
  163. } else {
  164. err = ioutil.WriteFile(filepath.Join("uploads", zf.name, rel.Target), b, 0644)
  165. if err != nil {
  166. return err
  167. }
  168. fmt.Fprintf(w, "![](%s)", "/"+filepath.Join("uploads", zf.name, escape(rel.Target, "()")))
  169. }
  170. break
  171. }
  172. return nil
  173. }
  174. func attr(attrs []xml.Attr, name string) (string, bool) {
  175. for _, attr := range attrs {
  176. if attr.Name.Local == name {
  177. return attr.Value, true
  178. }
  179. }
  180. return "", false
  181. }
  182. func (zf *file) walk(node *Node, w io.Writer) error {
  183. switch node.XMLName.Local {
  184. case "hyperlink":
  185. fmt.Fprint(w, "[")
  186. var cbuf bytes.Buffer
  187. for _, n := range node.Nodes {
  188. if err := zf.walk(&n, &cbuf); err != nil {
  189. return err
  190. }
  191. }
  192. fmt.Fprint(w, escape(cbuf.String(), "[]"))
  193. fmt.Fprint(w, "]")
  194. fmt.Fprint(w, "(")
  195. if id, ok := attr(node.Attrs, "id"); ok {
  196. for _, rel := range zf.rels.Relationship {
  197. if id == rel.ID {
  198. fmt.Fprint(w, escape(rel.Target, "()"))
  199. break
  200. }
  201. }
  202. }
  203. fmt.Fprint(w, ")")
  204. case "t":
  205. fmt.Fprint(w, string(node.Content))
  206. case "pPr":
  207. code := false
  208. for _, n := range node.Nodes {
  209. switch n.XMLName.Local {
  210. case "ind":
  211. if left, ok := attr(n.Attrs, "left"); ok {
  212. if i, err := strconv.Atoi(left); err == nil && i > 0 {
  213. fmt.Fprint(w, strings.Repeat(" ", i/360))
  214. }
  215. }
  216. case "pStyle":
  217. if val, ok := attr(n.Attrs, "val"); ok {
  218. if strings.HasPrefix(val, "Heading") {
  219. if i, err := strconv.Atoi(val[7:]); err == nil && i > 0 {
  220. fmt.Fprint(w, strings.Repeat("#", i)+" ")
  221. }
  222. } else if val == "Code" {
  223. code = true
  224. } else {
  225. if i, err := strconv.Atoi(val); err == nil && i > 0 {
  226. fmt.Fprint(w, strings.Repeat("#", i)+" ")
  227. }
  228. }
  229. }
  230. case "numPr":
  231. numID := ""
  232. ilvl := ""
  233. numFmt := ""
  234. start := 1
  235. ind := 0
  236. for _, nn := range n.Nodes {
  237. if nn.XMLName.Local == "numId" {
  238. if val, ok := attr(nn.Attrs, "val"); ok {
  239. numID = val
  240. }
  241. }
  242. if nn.XMLName.Local == "ilvl" {
  243. if val, ok := attr(nn.Attrs, "val"); ok {
  244. ilvl = val
  245. }
  246. }
  247. }
  248. for _, num := range zf.num.Num {
  249. if numID != num.NumID {
  250. continue
  251. }
  252. for _, abnum := range zf.num.AbstractNum {
  253. if abnum.AbstractNumID != num.AbstractNumID.Val {
  254. continue
  255. }
  256. for _, ablvl := range abnum.Lvl {
  257. if ablvl.Ilvl != ilvl {
  258. continue
  259. }
  260. if i, err := strconv.Atoi(ablvl.Start.Val); err == nil {
  261. start = i
  262. }
  263. if i, err := strconv.Atoi(ablvl.PPr.Ind.Left); err == nil {
  264. ind = i / 360
  265. }
  266. numFmt = ablvl.NumFmt.Val
  267. break
  268. }
  269. break
  270. }
  271. break
  272. }
  273. fmt.Fprint(w, strings.Repeat(" ", ind))
  274. switch numFmt {
  275. case "decimal", "aiueoFullWidth":
  276. key := fmt.Sprintf("%s:%d", numID, ind)
  277. cur, ok := zf.list[key]
  278. if !ok {
  279. zf.list[key] = start
  280. } else {
  281. zf.list[key] = cur + 1
  282. }
  283. fmt.Fprintf(w, "%d. ", zf.list[key])
  284. case "bullet":
  285. fmt.Fprint(w, "* ")
  286. }
  287. }
  288. }
  289. if code {
  290. fmt.Fprint(w, "`")
  291. }
  292. for _, n := range node.Nodes {
  293. if err := zf.walk(&n, w); err != nil {
  294. return err
  295. }
  296. }
  297. if code {
  298. fmt.Fprint(w, "`")
  299. }
  300. case "tbl":
  301. var rows [][]string
  302. for _, tr := range node.Nodes {
  303. if tr.XMLName.Local != "tr" {
  304. continue
  305. }
  306. var cols []string
  307. for _, tc := range tr.Nodes {
  308. if tc.XMLName.Local != "tc" {
  309. continue
  310. }
  311. var cbuf bytes.Buffer
  312. if err := zf.walk(&tc, &cbuf); err != nil {
  313. return err
  314. }
  315. cols = append(cols, strings.Replace(cbuf.String(), "\n", "", -1))
  316. }
  317. rows = append(rows, cols)
  318. }
  319. maxcol := 0
  320. for _, cols := range rows {
  321. if len(cols) > maxcol {
  322. maxcol = len(cols)
  323. }
  324. }
  325. widths := make([]int, maxcol)
  326. for _, row := range rows {
  327. for i := 0; i < maxcol; i++ {
  328. if i < len(row) {
  329. width := runewidth.StringWidth(row[i])
  330. if widths[i] < width {
  331. widths[i] = width
  332. }
  333. }
  334. }
  335. }
  336. for i, row := range rows {
  337. if i == 0 {
  338. for j := 0; j < maxcol; j++ {
  339. fmt.Fprint(w, "|")
  340. fmt.Fprint(w, strings.Repeat(" ", widths[j]))
  341. }
  342. fmt.Fprint(w, "|\n")
  343. for j := 0; j < maxcol; j++ {
  344. fmt.Fprint(w, "|")
  345. fmt.Fprint(w, strings.Repeat("-", widths[j]))
  346. }
  347. fmt.Fprint(w, "|\n")
  348. }
  349. for j := 0; j < maxcol; j++ {
  350. fmt.Fprint(w, "|")
  351. if j < len(row) {
  352. width := runewidth.StringWidth(row[j])
  353. fmt.Fprint(w, escape(row[j], "|"))
  354. fmt.Fprint(w, strings.Repeat(" ", widths[j]-width))
  355. } else {
  356. fmt.Fprint(w, strings.Repeat(" ", widths[j]))
  357. }
  358. }
  359. fmt.Fprint(w, "|\n")
  360. }
  361. fmt.Fprint(w, "\n")
  362. case "r":
  363. bold := false
  364. italic := false
  365. strike := false
  366. for _, n := range node.Nodes {
  367. if n.XMLName.Local != "rPr" {
  368. continue
  369. }
  370. for _, nn := range n.Nodes {
  371. switch nn.XMLName.Local {
  372. case "b":
  373. bold = true
  374. case "i":
  375. italic = true
  376. case "strike":
  377. strike = true
  378. }
  379. }
  380. }
  381. if strike {
  382. fmt.Fprint(w, "~~")
  383. }
  384. if bold {
  385. fmt.Fprint(w, "**")
  386. }
  387. if italic {
  388. fmt.Fprint(w, "*")
  389. }
  390. var cbuf bytes.Buffer
  391. for _, n := range node.Nodes {
  392. if err := zf.walk(&n, &cbuf); err != nil {
  393. return err
  394. }
  395. }
  396. fmt.Fprint(w, escape(cbuf.String(), `*~\`))
  397. if italic {
  398. fmt.Fprint(w, "*")
  399. }
  400. if bold {
  401. fmt.Fprint(w, "**")
  402. }
  403. if strike {
  404. fmt.Fprint(w, "~~")
  405. }
  406. case "p":
  407. for _, n := range node.Nodes {
  408. if err := zf.walk(&n, w); err != nil {
  409. return err
  410. }
  411. }
  412. fmt.Fprintln(w)
  413. case "blip":
  414. if id, ok := attr(node.Attrs, "embed"); ok {
  415. for _, rel := range zf.rels.Relationship {
  416. if id != rel.ID {
  417. continue
  418. }
  419. if err := zf.extract(&rel, w); err != nil {
  420. return err
  421. }
  422. }
  423. }
  424. case "Fallback":
  425. case "txbxContent":
  426. var cbuf bytes.Buffer
  427. for _, n := range node.Nodes {
  428. if err := zf.walk(&n, &cbuf); err != nil {
  429. return err
  430. }
  431. }
  432. fmt.Fprintln(w, "\n```\n"+cbuf.String()+"```")
  433. default:
  434. for _, n := range node.Nodes {
  435. if err := zf.walk(&n, w); err != nil {
  436. return err
  437. }
  438. }
  439. }
  440. return nil
  441. }
  442. func readFile(f *zip.File) (*Node, error) {
  443. rc, err := f.Open()
  444. defer rc.Close()
  445. b, _ := ioutil.ReadAll(rc)
  446. if err != nil {
  447. return nil, err
  448. }
  449. var node Node
  450. err = xml.Unmarshal(b, &node)
  451. if err != nil {
  452. return nil, err
  453. }
  454. return &node, nil
  455. }
  456. func findFile(files []*zip.File, target string) *zip.File {
  457. for _, f := range files {
  458. if ok, _ := path.Match(target, f.Name); ok {
  459. return f
  460. }
  461. }
  462. return nil
  463. }
  464. func Docx2md(arg string, embed bool) (string, error) {
  465. r, err := zip.OpenReader(arg)
  466. if err != nil {
  467. return "", err
  468. }
  469. defer r.Close()
  470. var rels Relationships
  471. var num Numbering
  472. for _, f := range r.File {
  473. switch f.Name {
  474. case "word/_rels/document.xml.rels":
  475. rc, err := f.Open()
  476. defer rc.Close()
  477. b, _ := ioutil.ReadAll(rc)
  478. if err != nil {
  479. return "", err
  480. }
  481. err = xml.Unmarshal(b, &rels)
  482. if err != nil {
  483. return "", err
  484. }
  485. case "word/numbering.xml":
  486. rc, err := f.Open()
  487. defer rc.Close()
  488. b, _ := ioutil.ReadAll(rc)
  489. if err != nil {
  490. return "", err
  491. }
  492. err = xml.Unmarshal(b, &num)
  493. if err != nil {
  494. return "", err
  495. }
  496. }
  497. }
  498. f := findFile(r.File, "word/document*.xml")
  499. if f == nil {
  500. return "", errors.New("incorrect document")
  501. }
  502. node, err := readFile(f)
  503. if err != nil {
  504. return "", err
  505. }
  506. fileNames := strings.Split(arg, "/")
  507. fileName := fileNames[len(fileNames)-1]
  508. // make sure the file name
  509. if !strings.HasSuffix(fileName, ".docx") {
  510. log.Fatal("File name must end with .docx")
  511. }
  512. var buf bytes.Buffer
  513. zf := &file{
  514. r: r,
  515. rels: rels,
  516. num: num,
  517. embed: embed,
  518. list: make(map[string]int),
  519. name: fileName,
  520. }
  521. err = zf.walk(node, &buf)
  522. if err != nil {
  523. return "", err
  524. }
  525. return buf.String(), nil
  526. }