1
0

docx2md.go 12 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571
  1. // https://github.com/mattn/docx2md
  2. // License MIT
  3. package utils
  4. import (
  5. "archive/zip"
  6. "bytes"
  7. "encoding/base64"
  8. "encoding/xml"
  9. "errors"
  10. _ "flag"
  11. "fmt"
  12. "io"
  13. "io/ioutil"
  14. "log"
  15. "os"
  16. "path"
  17. "path/filepath"
  18. _ "runtime"
  19. "strconv"
  20. "strings"
  21. "github.com/mattn/go-runewidth"
  22. )
// Relationship maps one <Relationship> XML element. Docx2md decodes these
// from word/_rels/document.xml.rels; walk looks entries up by ID to resolve
// hyperlink targets and embedded images.
type Relationship struct {
	Text       string `xml:",chardata"`
	ID         string `xml:"Id,attr"`     // compared with a hyperlink's "id" or a blip's "embed" attribute
	Type       string `xml:"Type,attr"`   // decoded but not read anywhere in this file
	Target     string `xml:"Target,attr"` // link destination, or the path of a part below "word/"
	TargetMode string `xml:"TargetMode,attr"`
}
// Relationships maps the root <Relationships> element and collects every
// <Relationship> child. Docx2md fills it from word/_rels/document.xml.rels.
type Relationships struct {
	XMLName      xml.Name       `xml:"Relationships"`
	Text         string         `xml:",chardata"`
	Xmlns        string         `xml:"xmlns,attr"`
	Relationship []Relationship `xml:"Relationship"` // scanned linearly by ID in walk
}
// TextVal maps any element whose payload is a single "val" attribute
// (for example <start>, <numFmt> or <abstractNumId> in the numbering part).
type TextVal struct {
	Text string `xml:",chardata"`
	Val  string `xml:"val,attr"` // the attribute value, kept as the raw string
}
// NumberingLvl maps one <lvl> element of an abstract numbering definition
// (see Numbering.AbstractNum). When rendering a list item, walk reads Ilvl,
// Start.Val, NumFmt.Val and PPr.Ind.Left; the remaining fields are decoded
// but not used in this file.
type NumberingLvl struct {
	Text      string  `xml:",chardata"`
	Ilvl      string  `xml:"ilvl,attr"` // level id; compared with the paragraph's numPr/ilvl value
	Tplc      string  `xml:"tplc,attr"`
	Tentative string  `xml:"tentative,attr"`
	Start     TextVal `xml:"start"`  // first number of the list; parsed with strconv.Atoi by walk
	NumFmt    TextVal `xml:"numFmt"` // walk acts on "decimal", "aiueoFullWidth" and "bullet"
	LvlText   TextVal `xml:"lvlText"`
	LvlJc     TextVal `xml:"lvlJc"`
	PPr       struct {
		Text string `xml:",chardata"`
		Ind  struct {
			Text    string `xml:",chardata"`
			Left    string `xml:"left,attr"` // walk divides this by 360 to get the number of indent spaces
			Hanging string `xml:"hanging,attr"`
		} `xml:"ind"`
	} `xml:"pPr"`
	RPr struct {
		Text string `xml:",chardata"`
		U    struct {
			Text string `xml:",chardata"`
			Val  string `xml:"val,attr"`
		} `xml:"u"`
		RFonts struct {
			Text string `xml:",chardata"`
			Hint string `xml:"hint,attr"`
		} `xml:"rFonts"`
	} `xml:"rPr"`
}
// Numbering maps the root <numbering> element. Docx2md fills it from
// word/numbering.xml. walk resolves a paragraph's numId through Num to an
// AbstractNum and then to the NumberingLvl matching the paragraph's ilvl.
type Numbering struct {
	XMLName xml.Name `xml:"numbering"`
	Text    string   `xml:",chardata"`
	// The attributes from Wpc to Ignorable are decoded but never read in this
	// file. NOTE(review): they look like the namespace declarations of the
	// root element -- confirm before relying on them.
	Wpc         string `xml:"wpc,attr"`
	Cx          string `xml:"cx,attr"`
	Cx1         string `xml:"cx1,attr"`
	Mc          string `xml:"mc,attr"`
	O           string `xml:"o,attr"`
	R           string `xml:"r,attr"`
	M           string `xml:"m,attr"`
	V           string `xml:"v,attr"`
	Wp14        string `xml:"wp14,attr"`
	Wp          string `xml:"wp,attr"`
	W10         string `xml:"w10,attr"`
	W           string `xml:"w,attr"`
	W14         string `xml:"w14,attr"`
	W15         string `xml:"w15,attr"`
	W16se       string `xml:"w16se,attr"`
	Wpg         string `xml:"wpg,attr"`
	Wpi         string `xml:"wpi,attr"`
	Wne         string `xml:"wne,attr"`
	Wps         string `xml:"wps,attr"`
	Ignorable   string `xml:"Ignorable,attr"`
	AbstractNum []struct {
		Text                       string         `xml:",chardata"`
		AbstractNumID              string         `xml:"abstractNumId,attr"` // compared with Num[].AbstractNumID.Val
		RestartNumberingAfterBreak string         `xml:"restartNumberingAfterBreak,attr"`
		Nsid                       TextVal        `xml:"nsid"`
		MultiLevelType             TextVal        `xml:"multiLevelType"`
		Tmpl                       TextVal        `xml:"tmpl"`
		Lvl                        []NumberingLvl `xml:"lvl"` // one entry per list level
	} `xml:"abstractNum"`
	Num []struct {
		Text          string  `xml:",chardata"`
		NumID         string  `xml:"numId,attr"`    // compared with the paragraph's numPr/numId value
		AbstractNumID TextVal `xml:"abstractNumId"` // Val names the AbstractNum entry to use
	} `xml:"num"`
}
// file carries the state shared by walk and extract while one document is
// being converted. Docx2md builds exactly one per call.
type file struct {
	rels  Relationships   // decoded word/_rels/document.xml.rels
	num   Numbering       // decoded word/numbering.xml
	r     *zip.ReadCloser // the open archive; extract scans r.File for image parts
	embed bool            // true: inline images as base64 data URIs; false: write them below "uploads"
	list  map[string]int  // last number emitted for each "numID:indent" key of an ordered list
	name  string          // document file name; extract trims ".docx" to name the uploads sub-directory
}
  120. // Node is
  121. type Node struct {
  122. XMLName xml.Name
  123. Attrs []xml.Attr `xml:"-"`
  124. Content []byte `xml:",innerxml"`
  125. Nodes []Node `xml:",any"`
  126. }
  127. // UnmarshalXML is
  128. func (n *Node) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
  129. n.Attrs = start.Attr
  130. type node Node
  131. return d.DecodeElement((*node)(n), &start)
  132. }
  133. func escape(s, set string) string {
  134. replacer := []string{}
  135. for _, r := range []rune(set) {
  136. rs := string(r)
  137. replacer = append(replacer, rs, `\`+rs)
  138. }
  139. return strings.NewReplacer(replacer...).Replace(s)
  140. }
  141. func (zf *file) extract(rel *Relationship, w io.Writer) error {
  142. err := os.MkdirAll(
  143. filepath.Join("uploads",
  144. strings.TrimSuffix(zf.name, ".docx"),
  145. filepath.Dir(rel.Target)),
  146. 0755)
  147. if err != nil {
  148. return err
  149. }
  150. for _, f := range zf.r.File {
  151. if f.Name != "word/"+rel.Target {
  152. continue
  153. }
  154. rc, err := f.Open()
  155. if err != nil {
  156. return err
  157. }
  158. defer rc.Close()
  159. b := make([]byte, f.UncompressedSize64)
  160. n, err := rc.Read(b)
  161. if err != nil && err != io.EOF {
  162. return err
  163. }
  164. if zf.embed {
  165. fmt.Fprintf(w, "![](data:image/png;base64,%s)",
  166. base64.StdEncoding.EncodeToString(b[:n]))
  167. } else {
  168. err = ioutil.WriteFile(
  169. filepath.Join("uploads",
  170. strings.TrimSuffix(zf.name, ".docx"),
  171. rel.Target),
  172. b, 0644)
  173. if err != nil {
  174. return err
  175. }
  176. fmt.Fprintf(w, "![](%s)", "/"+filepath.Join(
  177. "uploads",
  178. strings.TrimSuffix(zf.name, ".docx"),
  179. escape(rel.Target, "()")))
  180. }
  181. break
  182. }
  183. return nil
  184. }
  185. func attr(attrs []xml.Attr, name string) (string, bool) {
  186. for _, attr := range attrs {
  187. if attr.Name.Local == name {
  188. return attr.Value, true
  189. }
  190. }
  191. return "", false
  192. }
  193. func (zf *file) walk(node *Node, w io.Writer) error {
  194. switch node.XMLName.Local {
  195. case "hyperlink":
  196. fmt.Fprint(w, "[")
  197. var cbuf bytes.Buffer
  198. for _, n := range node.Nodes {
  199. if err := zf.walk(&n, &cbuf); err != nil {
  200. return err
  201. }
  202. }
  203. fmt.Fprint(w, escape(cbuf.String(), "[]"))
  204. fmt.Fprint(w, "]")
  205. fmt.Fprint(w, "(")
  206. if id, ok := attr(node.Attrs, "id"); ok {
  207. for _, rel := range zf.rels.Relationship {
  208. if id == rel.ID {
  209. fmt.Fprint(w, escape(rel.Target, "()"))
  210. break
  211. }
  212. }
  213. }
  214. fmt.Fprint(w, ")")
  215. case "t":
  216. fmt.Fprint(w, string(node.Content))
  217. case "pPr":
  218. code := false
  219. for _, n := range node.Nodes {
  220. switch n.XMLName.Local {
  221. case "ind":
  222. if left, ok := attr(n.Attrs, "left"); ok {
  223. if i, err := strconv.Atoi(left); err == nil && i > 0 {
  224. fmt.Fprint(w, strings.Repeat(" ", i/360))
  225. }
  226. }
  227. case "pStyle":
  228. if val, ok := attr(n.Attrs, "val"); ok {
  229. if strings.HasPrefix(val, "Heading") {
  230. if i, err := strconv.Atoi(val[7:]); err == nil && i > 0 {
  231. fmt.Fprint(w, strings.Repeat("#", i)+" ")
  232. }
  233. } else if val == "Code" {
  234. code = true
  235. } else {
  236. if i, err := strconv.Atoi(val); err == nil && i > 0 {
  237. fmt.Fprint(w, strings.Repeat("#", i)+" ")
  238. }
  239. }
  240. }
  241. case "numPr":
  242. numID := ""
  243. ilvl := ""
  244. numFmt := ""
  245. start := 1
  246. ind := 0
  247. for _, nn := range n.Nodes {
  248. if nn.XMLName.Local == "numId" {
  249. if val, ok := attr(nn.Attrs, "val"); ok {
  250. numID = val
  251. }
  252. }
  253. if nn.XMLName.Local == "ilvl" {
  254. if val, ok := attr(nn.Attrs, "val"); ok {
  255. ilvl = val
  256. }
  257. }
  258. }
  259. for _, num := range zf.num.Num {
  260. if numID != num.NumID {
  261. continue
  262. }
  263. for _, abnum := range zf.num.AbstractNum {
  264. if abnum.AbstractNumID != num.AbstractNumID.Val {
  265. continue
  266. }
  267. for _, ablvl := range abnum.Lvl {
  268. if ablvl.Ilvl != ilvl {
  269. continue
  270. }
  271. if i, err := strconv.Atoi(ablvl.Start.Val); err == nil {
  272. start = i
  273. }
  274. if i, err := strconv.Atoi(ablvl.PPr.Ind.Left); err == nil {
  275. ind = i / 360
  276. }
  277. numFmt = ablvl.NumFmt.Val
  278. break
  279. }
  280. break
  281. }
  282. break
  283. }
  284. fmt.Fprint(w, strings.Repeat(" ", ind))
  285. switch numFmt {
  286. case "decimal", "aiueoFullWidth":
  287. key := fmt.Sprintf("%s:%d", numID, ind)
  288. cur, ok := zf.list[key]
  289. if !ok {
  290. zf.list[key] = start
  291. } else {
  292. zf.list[key] = cur + 1
  293. }
  294. fmt.Fprintf(w, "%d. ", zf.list[key])
  295. case "bullet":
  296. fmt.Fprint(w, "* ")
  297. }
  298. }
  299. }
  300. if code {
  301. fmt.Fprint(w, "`")
  302. }
  303. for _, n := range node.Nodes {
  304. if err := zf.walk(&n, w); err != nil {
  305. return err
  306. }
  307. }
  308. if code {
  309. fmt.Fprint(w, "`")
  310. }
  311. case "tbl":
  312. var rows [][]string
  313. for _, tr := range node.Nodes {
  314. if tr.XMLName.Local != "tr" {
  315. continue
  316. }
  317. var cols []string
  318. for _, tc := range tr.Nodes {
  319. if tc.XMLName.Local != "tc" {
  320. continue
  321. }
  322. var cbuf bytes.Buffer
  323. if err := zf.walk(&tc, &cbuf); err != nil {
  324. return err
  325. }
  326. cols = append(cols, strings.Replace(cbuf.String(), "\n", "", -1))
  327. }
  328. rows = append(rows, cols)
  329. }
  330. maxcol := 0
  331. for _, cols := range rows {
  332. if len(cols) > maxcol {
  333. maxcol = len(cols)
  334. }
  335. }
  336. widths := make([]int, maxcol)
  337. for _, row := range rows {
  338. for i := 0; i < maxcol; i++ {
  339. if i < len(row) {
  340. width := runewidth.StringWidth(row[i])
  341. if widths[i] < width {
  342. widths[i] = width
  343. }
  344. }
  345. }
  346. }
  347. for i, row := range rows {
  348. if i == 0 {
  349. for j := 0; j < maxcol; j++ {
  350. fmt.Fprint(w, "|")
  351. fmt.Fprint(w, strings.Repeat(" ", widths[j]))
  352. }
  353. fmt.Fprint(w, "|\n")
  354. for j := 0; j < maxcol; j++ {
  355. fmt.Fprint(w, "|")
  356. fmt.Fprint(w, strings.Repeat("-", widths[j]))
  357. }
  358. fmt.Fprint(w, "|\n")
  359. }
  360. for j := 0; j < maxcol; j++ {
  361. fmt.Fprint(w, "|")
  362. if j < len(row) {
  363. width := runewidth.StringWidth(row[j])
  364. fmt.Fprint(w, escape(row[j], "|"))
  365. fmt.Fprint(w, strings.Repeat(" ", widths[j]-width))
  366. } else {
  367. fmt.Fprint(w, strings.Repeat(" ", widths[j]))
  368. }
  369. }
  370. fmt.Fprint(w, "|\n")
  371. }
  372. fmt.Fprint(w, "\n")
  373. case "r":
  374. bold := false
  375. italic := false
  376. strike := false
  377. for _, n := range node.Nodes {
  378. if n.XMLName.Local != "rPr" {
  379. continue
  380. }
  381. for _, nn := range n.Nodes {
  382. switch nn.XMLName.Local {
  383. case "b":
  384. bold = true
  385. case "i":
  386. italic = true
  387. case "strike":
  388. strike = true
  389. }
  390. }
  391. }
  392. if strike {
  393. fmt.Fprint(w, "~~")
  394. }
  395. if bold {
  396. fmt.Fprint(w, "**")
  397. }
  398. if italic {
  399. fmt.Fprint(w, "*")
  400. }
  401. var cbuf bytes.Buffer
  402. for _, n := range node.Nodes {
  403. if err := zf.walk(&n, &cbuf); err != nil {
  404. return err
  405. }
  406. }
  407. fmt.Fprint(w, escape(cbuf.String(), `*~\`))
  408. if italic {
  409. fmt.Fprint(w, "*")
  410. }
  411. if bold {
  412. fmt.Fprint(w, "**")
  413. }
  414. if strike {
  415. fmt.Fprint(w, "~~")
  416. }
  417. case "p":
  418. for _, n := range node.Nodes {
  419. if err := zf.walk(&n, w); err != nil {
  420. return err
  421. }
  422. }
  423. fmt.Fprintln(w)
  424. case "blip":
  425. if id, ok := attr(node.Attrs, "embed"); ok {
  426. for _, rel := range zf.rels.Relationship {
  427. if id != rel.ID {
  428. continue
  429. }
  430. if err := zf.extract(&rel, w); err != nil {
  431. return err
  432. }
  433. }
  434. }
  435. case "Fallback":
  436. case "txbxContent":
  437. var cbuf bytes.Buffer
  438. for _, n := range node.Nodes {
  439. if err := zf.walk(&n, &cbuf); err != nil {
  440. return err
  441. }
  442. }
  443. fmt.Fprintln(w, "\n```\n"+cbuf.String()+"```")
  444. default:
  445. for _, n := range node.Nodes {
  446. if err := zf.walk(&n, w); err != nil {
  447. return err
  448. }
  449. }
  450. }
  451. return nil
  452. }
  453. func readFile(f *zip.File) (*Node, error) {
  454. rc, err := f.Open()
  455. defer rc.Close()
  456. b, _ := ioutil.ReadAll(rc)
  457. if err != nil {
  458. return nil, err
  459. }
  460. var node Node
  461. err = xml.Unmarshal(b, &node)
  462. if err != nil {
  463. return nil, err
  464. }
  465. return &node, nil
  466. }
  467. func findFile(files []*zip.File, target string) *zip.File {
  468. for _, f := range files {
  469. if ok, _ := path.Match(target, f.Name); ok {
  470. return f
  471. }
  472. }
  473. return nil
  474. }
  475. func Docx2md(arg string, embed bool) (string, error) {
  476. r, err := zip.OpenReader(arg)
  477. if err != nil {
  478. return "", err
  479. }
  480. defer r.Close()
  481. var rels Relationships
  482. var num Numbering
  483. for _, f := range r.File {
  484. switch f.Name {
  485. case "word/_rels/document.xml.rels":
  486. rc, err := f.Open()
  487. defer rc.Close()
  488. b, _ := ioutil.ReadAll(rc)
  489. if err != nil {
  490. return "", err
  491. }
  492. err = xml.Unmarshal(b, &rels)
  493. if err != nil {
  494. return "", err
  495. }
  496. case "word/numbering.xml":
  497. rc, err := f.Open()
  498. defer rc.Close()
  499. b, _ := ioutil.ReadAll(rc)
  500. if err != nil {
  501. return "", err
  502. }
  503. err = xml.Unmarshal(b, &num)
  504. if err != nil {
  505. return "", err
  506. }
  507. }
  508. }
  509. f := findFile(r.File, "word/document*.xml")
  510. if f == nil {
  511. return "", errors.New("incorrect document")
  512. }
  513. node, err := readFile(f)
  514. if err != nil {
  515. return "", err
  516. }
  517. fileNames := strings.Split(arg, "/")
  518. fileName := fileNames[len(fileNames)-1]
  519. // make sure the file name
  520. if !strings.HasSuffix(fileName, ".docx") {
  521. log.Fatal("File name must end with .docx")
  522. }
  523. var buf bytes.Buffer
  524. zf := &file{
  525. r: r,
  526. rels: rels,
  527. num: num,
  528. embed: embed,
  529. list: make(map[string]int),
  530. name: fileName,
  531. }
  532. err = zf.walk(node, &buf)
  533. if err != nil {
  534. return "", err
  535. }
  536. return buf.String(), nil
  537. }