|
@@ -0,0 +1,571 @@
|
|
|
+// https://github.com/mattn/docx2md
|
|
|
+// License MIT
|
|
|
+package utils
|
|
|
+
|
|
|
+import (
|
|
|
+ "archive/zip"
|
|
|
+ "bytes"
|
|
|
+ "encoding/base64"
|
|
|
+ "encoding/xml"
|
|
|
+ "errors"
|
|
|
+ _ "flag"
|
|
|
+ "fmt"
|
|
|
+ "io"
|
|
|
+ "io/ioutil"
|
|
|
+ "log"
|
|
|
+ "os"
|
|
|
+ "path"
|
|
|
+ "path/filepath"
|
|
|
+ _ "runtime"
|
|
|
+ "strconv"
|
|
|
+ "strings"
|
|
|
+
|
|
|
+ "github.com/mattn/go-runewidth"
|
|
|
+)
|
|
|
+
|
|
|
+// Relationship is
|
|
|
+type Relationship struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ ID string `xml:"Id,attr"`
|
|
|
+ Type string `xml:"Type,attr"`
|
|
|
+ Target string `xml:"Target,attr"`
|
|
|
+ TargetMode string `xml:"TargetMode,attr"`
|
|
|
+}
|
|
|
+
|
|
|
+// Relationships is
|
|
|
+type Relationships struct {
|
|
|
+ XMLName xml.Name `xml:"Relationships"`
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Xmlns string `xml:"xmlns,attr"`
|
|
|
+ Relationship []Relationship `xml:"Relationship"`
|
|
|
+}
|
|
|
+
|
|
|
+// TextVal is
|
|
|
+type TextVal struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Val string `xml:"val,attr"`
|
|
|
+}
|
|
|
+
|
|
|
+// NumberingLvl is
|
|
|
+type NumberingLvl struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Ilvl string `xml:"ilvl,attr"`
|
|
|
+ Tplc string `xml:"tplc,attr"`
|
|
|
+ Tentative string `xml:"tentative,attr"`
|
|
|
+ Start TextVal `xml:"start"`
|
|
|
+ NumFmt TextVal `xml:"numFmt"`
|
|
|
+ LvlText TextVal `xml:"lvlText"`
|
|
|
+ LvlJc TextVal `xml:"lvlJc"`
|
|
|
+ PPr struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Ind struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Left string `xml:"left,attr"`
|
|
|
+ Hanging string `xml:"hanging,attr"`
|
|
|
+ } `xml:"ind"`
|
|
|
+ } `xml:"pPr"`
|
|
|
+ RPr struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ U struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Val string `xml:"val,attr"`
|
|
|
+ } `xml:"u"`
|
|
|
+ RFonts struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Hint string `xml:"hint,attr"`
|
|
|
+ } `xml:"rFonts"`
|
|
|
+ } `xml:"rPr"`
|
|
|
+}
|
|
|
+
|
|
|
+// Numbering is
|
|
|
+type Numbering struct {
|
|
|
+ XMLName xml.Name `xml:"numbering"`
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ Wpc string `xml:"wpc,attr"`
|
|
|
+ Cx string `xml:"cx,attr"`
|
|
|
+ Cx1 string `xml:"cx1,attr"`
|
|
|
+ Mc string `xml:"mc,attr"`
|
|
|
+ O string `xml:"o,attr"`
|
|
|
+ R string `xml:"r,attr"`
|
|
|
+ M string `xml:"m,attr"`
|
|
|
+ V string `xml:"v,attr"`
|
|
|
+ Wp14 string `xml:"wp14,attr"`
|
|
|
+ Wp string `xml:"wp,attr"`
|
|
|
+ W10 string `xml:"w10,attr"`
|
|
|
+ W string `xml:"w,attr"`
|
|
|
+ W14 string `xml:"w14,attr"`
|
|
|
+ W15 string `xml:"w15,attr"`
|
|
|
+ W16se string `xml:"w16se,attr"`
|
|
|
+ Wpg string `xml:"wpg,attr"`
|
|
|
+ Wpi string `xml:"wpi,attr"`
|
|
|
+ Wne string `xml:"wne,attr"`
|
|
|
+ Wps string `xml:"wps,attr"`
|
|
|
+ Ignorable string `xml:"Ignorable,attr"`
|
|
|
+ AbstractNum []struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ AbstractNumID string `xml:"abstractNumId,attr"`
|
|
|
+ RestartNumberingAfterBreak string `xml:"restartNumberingAfterBreak,attr"`
|
|
|
+ Nsid TextVal `xml:"nsid"`
|
|
|
+ MultiLevelType TextVal `xml:"multiLevelType"`
|
|
|
+ Tmpl TextVal `xml:"tmpl"`
|
|
|
+ Lvl []NumberingLvl `xml:"lvl"`
|
|
|
+ } `xml:"abstractNum"`
|
|
|
+ Num []struct {
|
|
|
+ Text string `xml:",chardata"`
|
|
|
+ NumID string `xml:"numId,attr"`
|
|
|
+ AbstractNumID TextVal `xml:"abstractNumId"`
|
|
|
+ } `xml:"num"`
|
|
|
+}
|
|
|
+
|
|
|
+type file struct {
|
|
|
+ rels Relationships
|
|
|
+ num Numbering
|
|
|
+ r *zip.ReadCloser
|
|
|
+ embed bool
|
|
|
+ list map[string]int
|
|
|
+ name string
|
|
|
+}
|
|
|
+
|
|
|
+// Node is
|
|
|
+type Node struct {
|
|
|
+ XMLName xml.Name
|
|
|
+ Attrs []xml.Attr `xml:"-"`
|
|
|
+ Content []byte `xml:",innerxml"`
|
|
|
+ Nodes []Node `xml:",any"`
|
|
|
+}
|
|
|
+
|
|
|
+// UnmarshalXML is
|
|
|
+func (n *Node) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
|
|
|
+ n.Attrs = start.Attr
|
|
|
+ type node Node
|
|
|
+
|
|
|
+ return d.DecodeElement((*node)(n), &start)
|
|
|
+}
|
|
|
+
|
|
|
+func escape(s, set string) string {
|
|
|
+ replacer := []string{}
|
|
|
+ for _, r := range []rune(set) {
|
|
|
+ rs := string(r)
|
|
|
+ replacer = append(replacer, rs, `\`+rs)
|
|
|
+ }
|
|
|
+ return strings.NewReplacer(replacer...).Replace(s)
|
|
|
+}
|
|
|
+
|
|
|
+func (zf *file) extract(rel *Relationship, w io.Writer) error {
|
|
|
+ err := os.MkdirAll(
|
|
|
+ filepath.Join("uploads",
|
|
|
+ strings.TrimSuffix(zf.name, ".docx"),
|
|
|
+ filepath.Dir(rel.Target)),
|
|
|
+ 0755)
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ for _, f := range zf.r.File {
|
|
|
+ if f.Name != "word/"+rel.Target {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ rc, err := f.Open()
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ defer rc.Close()
|
|
|
+
|
|
|
+ b := make([]byte, f.UncompressedSize64)
|
|
|
+ n, err := rc.Read(b)
|
|
|
+ if err != nil && err != io.EOF {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ if zf.embed {
|
|
|
+ fmt.Fprintf(w, "",
|
|
|
+ base64.StdEncoding.EncodeToString(b[:n]))
|
|
|
+ } else {
|
|
|
+ err = ioutil.WriteFile(
|
|
|
+ filepath.Join("uploads",
|
|
|
+ strings.TrimSuffix(zf.name, ".docx"),
|
|
|
+ rel.Target),
|
|
|
+ b, 0644)
|
|
|
+ if err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ fmt.Fprintf(w, "", "/"+filepath.Join(
|
|
|
+ "uploads",
|
|
|
+ strings.TrimSuffix(zf.name, ".docx"),
|
|
|
+ escape(rel.Target, "()")))
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+func attr(attrs []xml.Attr, name string) (string, bool) {
|
|
|
+ for _, attr := range attrs {
|
|
|
+ if attr.Name.Local == name {
|
|
|
+ return attr.Value, true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return "", false
|
|
|
+}
|
|
|
+
|
|
|
+func (zf *file) walk(node *Node, w io.Writer) error {
|
|
|
+ switch node.XMLName.Local {
|
|
|
+ case "hyperlink":
|
|
|
+ fmt.Fprint(w, "[")
|
|
|
+ var cbuf bytes.Buffer
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, &cbuf); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, escape(cbuf.String(), "[]"))
|
|
|
+ fmt.Fprint(w, "]")
|
|
|
+
|
|
|
+ fmt.Fprint(w, "(")
|
|
|
+ if id, ok := attr(node.Attrs, "id"); ok {
|
|
|
+ for _, rel := range zf.rels.Relationship {
|
|
|
+ if id == rel.ID {
|
|
|
+ fmt.Fprint(w, escape(rel.Target, "()"))
|
|
|
+ break
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, ")")
|
|
|
+ case "t":
|
|
|
+ fmt.Fprint(w, string(node.Content))
|
|
|
+ case "pPr":
|
|
|
+ code := false
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ switch n.XMLName.Local {
|
|
|
+ case "ind":
|
|
|
+ if left, ok := attr(n.Attrs, "left"); ok {
|
|
|
+ if i, err := strconv.Atoi(left); err == nil && i > 0 {
|
|
|
+ fmt.Fprint(w, strings.Repeat(" ", i/360))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "pStyle":
|
|
|
+ if val, ok := attr(n.Attrs, "val"); ok {
|
|
|
+ if strings.HasPrefix(val, "Heading") {
|
|
|
+ if i, err := strconv.Atoi(val[7:]); err == nil && i > 0 {
|
|
|
+ fmt.Fprint(w, strings.Repeat("#", i)+" ")
|
|
|
+ }
|
|
|
+ } else if val == "Code" {
|
|
|
+ code = true
|
|
|
+ } else {
|
|
|
+ if i, err := strconv.Atoi(val); err == nil && i > 0 {
|
|
|
+ fmt.Fprint(w, strings.Repeat("#", i)+" ")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "numPr":
|
|
|
+ numID := ""
|
|
|
+ ilvl := ""
|
|
|
+ numFmt := ""
|
|
|
+ start := 1
|
|
|
+ ind := 0
|
|
|
+ for _, nn := range n.Nodes {
|
|
|
+ if nn.XMLName.Local == "numId" {
|
|
|
+ if val, ok := attr(nn.Attrs, "val"); ok {
|
|
|
+ numID = val
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if nn.XMLName.Local == "ilvl" {
|
|
|
+ if val, ok := attr(nn.Attrs, "val"); ok {
|
|
|
+ ilvl = val
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for _, num := range zf.num.Num {
|
|
|
+ if numID != num.NumID {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ for _, abnum := range zf.num.AbstractNum {
|
|
|
+ if abnum.AbstractNumID != num.AbstractNumID.Val {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ for _, ablvl := range abnum.Lvl {
|
|
|
+ if ablvl.Ilvl != ilvl {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if i, err := strconv.Atoi(ablvl.Start.Val); err == nil {
|
|
|
+ start = i
|
|
|
+ }
|
|
|
+ if i, err := strconv.Atoi(ablvl.PPr.Ind.Left); err == nil {
|
|
|
+ ind = i / 360
|
|
|
+ }
|
|
|
+ numFmt = ablvl.NumFmt.Val
|
|
|
+ break
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+ break
|
|
|
+ }
|
|
|
+
|
|
|
+ fmt.Fprint(w, strings.Repeat(" ", ind))
|
|
|
+ switch numFmt {
|
|
|
+ case "decimal", "aiueoFullWidth":
|
|
|
+ key := fmt.Sprintf("%s:%d", numID, ind)
|
|
|
+ cur, ok := zf.list[key]
|
|
|
+ if !ok {
|
|
|
+ zf.list[key] = start
|
|
|
+ } else {
|
|
|
+ zf.list[key] = cur + 1
|
|
|
+ }
|
|
|
+ fmt.Fprintf(w, "%d. ", zf.list[key])
|
|
|
+ case "bullet":
|
|
|
+ fmt.Fprint(w, "* ")
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if code {
|
|
|
+ fmt.Fprint(w, "`")
|
|
|
+ }
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, w); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if code {
|
|
|
+ fmt.Fprint(w, "`")
|
|
|
+ }
|
|
|
+ case "tbl":
|
|
|
+ var rows [][]string
|
|
|
+ for _, tr := range node.Nodes {
|
|
|
+ if tr.XMLName.Local != "tr" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ var cols []string
|
|
|
+ for _, tc := range tr.Nodes {
|
|
|
+ if tc.XMLName.Local != "tc" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ var cbuf bytes.Buffer
|
|
|
+ if err := zf.walk(&tc, &cbuf); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ cols = append(cols, strings.Replace(cbuf.String(), "\n", "", -1))
|
|
|
+ }
|
|
|
+ rows = append(rows, cols)
|
|
|
+ }
|
|
|
+ maxcol := 0
|
|
|
+ for _, cols := range rows {
|
|
|
+ if len(cols) > maxcol {
|
|
|
+ maxcol = len(cols)
|
|
|
+ }
|
|
|
+ }
|
|
|
+ widths := make([]int, maxcol)
|
|
|
+ for _, row := range rows {
|
|
|
+ for i := 0; i < maxcol; i++ {
|
|
|
+ if i < len(row) {
|
|
|
+ width := runewidth.StringWidth(row[i])
|
|
|
+ if widths[i] < width {
|
|
|
+ widths[i] = width
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ for i, row := range rows {
|
|
|
+ if i == 0 {
|
|
|
+ for j := 0; j < maxcol; j++ {
|
|
|
+ fmt.Fprint(w, "|")
|
|
|
+ fmt.Fprint(w, strings.Repeat(" ", widths[j]))
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, "|\n")
|
|
|
+ for j := 0; j < maxcol; j++ {
|
|
|
+ fmt.Fprint(w, "|")
|
|
|
+ fmt.Fprint(w, strings.Repeat("-", widths[j]))
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, "|\n")
|
|
|
+ }
|
|
|
+ for j := 0; j < maxcol; j++ {
|
|
|
+ fmt.Fprint(w, "|")
|
|
|
+ if j < len(row) {
|
|
|
+ width := runewidth.StringWidth(row[j])
|
|
|
+ fmt.Fprint(w, escape(row[j], "|"))
|
|
|
+ fmt.Fprint(w, strings.Repeat(" ", widths[j]-width))
|
|
|
+ } else {
|
|
|
+ fmt.Fprint(w, strings.Repeat(" ", widths[j]))
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, "|\n")
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, "\n")
|
|
|
+ case "r":
|
|
|
+ bold := false
|
|
|
+ italic := false
|
|
|
+ strike := false
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if n.XMLName.Local != "rPr" {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ for _, nn := range n.Nodes {
|
|
|
+ switch nn.XMLName.Local {
|
|
|
+ case "b":
|
|
|
+ bold = true
|
|
|
+ case "i":
|
|
|
+ italic = true
|
|
|
+ case "strike":
|
|
|
+ strike = true
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ if strike {
|
|
|
+ fmt.Fprint(w, "~~")
|
|
|
+ }
|
|
|
+ if bold {
|
|
|
+ fmt.Fprint(w, "**")
|
|
|
+ }
|
|
|
+ if italic {
|
|
|
+ fmt.Fprint(w, "*")
|
|
|
+ }
|
|
|
+ var cbuf bytes.Buffer
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, &cbuf); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprint(w, escape(cbuf.String(), `*~\`))
|
|
|
+ if italic {
|
|
|
+ fmt.Fprint(w, "*")
|
|
|
+ }
|
|
|
+ if bold {
|
|
|
+ fmt.Fprint(w, "**")
|
|
|
+ }
|
|
|
+ if strike {
|
|
|
+ fmt.Fprint(w, "~~")
|
|
|
+ }
|
|
|
+ case "p":
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, w); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprintln(w)
|
|
|
+ case "blip":
|
|
|
+ if id, ok := attr(node.Attrs, "embed"); ok {
|
|
|
+ for _, rel := range zf.rels.Relationship {
|
|
|
+ if id != rel.ID {
|
|
|
+ continue
|
|
|
+ }
|
|
|
+ if err := zf.extract(&rel, w); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+ case "Fallback":
|
|
|
+ case "txbxContent":
|
|
|
+ var cbuf bytes.Buffer
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, &cbuf); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ fmt.Fprintln(w, "\n```\n"+cbuf.String()+"```")
|
|
|
+ default:
|
|
|
+ for _, n := range node.Nodes {
|
|
|
+ if err := zf.walk(&n, w); err != nil {
|
|
|
+ return err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+func readFile(f *zip.File) (*Node, error) {
|
|
|
+ rc, err := f.Open()
|
|
|
+ defer rc.Close()
|
|
|
+
|
|
|
+ b, _ := ioutil.ReadAll(rc)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+
|
|
|
+ var node Node
|
|
|
+ err = xml.Unmarshal(b, &node)
|
|
|
+ if err != nil {
|
|
|
+ return nil, err
|
|
|
+ }
|
|
|
+ return &node, nil
|
|
|
+}
|
|
|
+
|
|
|
+func findFile(files []*zip.File, target string) *zip.File {
|
|
|
+ for _, f := range files {
|
|
|
+ if ok, _ := path.Match(target, f.Name); ok {
|
|
|
+ return f
|
|
|
+ }
|
|
|
+ }
|
|
|
+ return nil
|
|
|
+}
|
|
|
+
|
|
|
+func Docx2md(arg string, embed bool) (string, error) {
|
|
|
+ r, err := zip.OpenReader(arg)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+ defer r.Close()
|
|
|
+
|
|
|
+ var rels Relationships
|
|
|
+ var num Numbering
|
|
|
+
|
|
|
+ for _, f := range r.File {
|
|
|
+ switch f.Name {
|
|
|
+ case "word/_rels/document.xml.rels":
|
|
|
+ rc, err := f.Open()
|
|
|
+ defer rc.Close()
|
|
|
+
|
|
|
+ b, _ := ioutil.ReadAll(rc)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+
|
|
|
+ err = xml.Unmarshal(b, &rels)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+ case "word/numbering.xml":
|
|
|
+ rc, err := f.Open()
|
|
|
+ defer rc.Close()
|
|
|
+
|
|
|
+ b, _ := ioutil.ReadAll(rc)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+
|
|
|
+ err = xml.Unmarshal(b, &num)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+ }
|
|
|
+ }
|
|
|
+
|
|
|
+ f := findFile(r.File, "word/document*.xml")
|
|
|
+ if f == nil {
|
|
|
+ return "", errors.New("incorrect document")
|
|
|
+ }
|
|
|
+ node, err := readFile(f)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+
|
|
|
+ fileNames := strings.Split(arg, "/")
|
|
|
+ fileName := fileNames[len(fileNames)-1]
|
|
|
+ // make sure the file name
|
|
|
+ if !strings.HasSuffix(fileName, ".docx") {
|
|
|
+ log.Fatal("File name must end with .docx")
|
|
|
+ }
|
|
|
+
|
|
|
+ var buf bytes.Buffer
|
|
|
+ zf := &file{
|
|
|
+ r: r,
|
|
|
+ rels: rels,
|
|
|
+ num: num,
|
|
|
+ embed: embed,
|
|
|
+ list: make(map[string]int),
|
|
|
+ name: fileName,
|
|
|
+ }
|
|
|
+ err = zf.walk(node, &buf)
|
|
|
+ if err != nil {
|
|
|
+ return "", err
|
|
|
+ }
|
|
|
+
|
|
|
+ return buf.String(), nil
|
|
|
+}
|