123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352353354355356357358359360361362363364365366367368369370371372373374375376377378379380381382383384385386387388389390391392393394395396397398399400401402403404405406407408409410411412413414415416417418419420421422423424425426427428429430431432433434435436437438439440441442443444445446447448449450451452453454455456457458459460461462463464465466467468469470471472473474475476477478479480481482483484485486487488489490491492493494495496497498499500501502503504505506507508509510511512513514515516517518519520521522523524525526527528529530531532533534535536537538539540541542543544545546547548549550551552553554555556557558559560561562563564565566567568569570571 |
- // https://github.com/mattn/docx2md
- // License MIT
- package utils
- import (
- "archive/zip"
- "bytes"
- "encoding/base64"
- "encoding/xml"
- "errors"
- _ "flag"
- "fmt"
- "io"
- "io/ioutil"
- "log"
- "os"
- "path"
- "path/filepath"
- _ "runtime"
- "strconv"
- "strings"
- "github.com/mattn/go-runewidth"
- )
// Relationship is one <Relationship> element of a relationships part. The
// converter uses it to resolve an ID referenced from the document body to the
// Target it stands for.
type Relationship struct {
	Text       string `xml:",chardata"`       // character data found inside the element
	ID         string `xml:"Id,attr"`         // value of the Id attribute
	Type       string `xml:"Type,attr"`       // value of the Type attribute
	Target     string `xml:"Target,attr"`     // value of the Target attribute
	TargetMode string `xml:"TargetMode,attr"` // value of the TargetMode attribute
}
- // Relationships is
- type Relationships struct {
- XMLName xml.Name `xml:"Relationships"`
- Text string `xml:",chardata"`
- Xmlns string `xml:"xmlns,attr"`
- Relationship []Relationship `xml:"Relationship"`
- }
// TextVal captures an element whose payload is carried by a "val" attribute,
// together with any character data inside the element.
type TextVal struct {
	Text string `xml:",chardata"` // character data found inside the element
	Val  string `xml:"val,attr"`  // value of the val attribute
}
- // NumberingLvl is
- type NumberingLvl struct {
- Text string `xml:",chardata"`
- Ilvl string `xml:"ilvl,attr"`
- Tplc string `xml:"tplc,attr"`
- Tentative string `xml:"tentative,attr"`
- Start TextVal `xml:"start"`
- NumFmt TextVal `xml:"numFmt"`
- LvlText TextVal `xml:"lvlText"`
- LvlJc TextVal `xml:"lvlJc"`
- PPr struct {
- Text string `xml:",chardata"`
- Ind struct {
- Text string `xml:",chardata"`
- Left string `xml:"left,attr"`
- Hanging string `xml:"hanging,attr"`
- } `xml:"ind"`
- } `xml:"pPr"`
- RPr struct {
- Text string `xml:",chardata"`
- U struct {
- Text string `xml:",chardata"`
- Val string `xml:"val,attr"`
- } `xml:"u"`
- RFonts struct {
- Text string `xml:",chardata"`
- Hint string `xml:"hint,attr"`
- } `xml:"rFonts"`
- } `xml:"rPr"`
- }
- // Numbering is
- type Numbering struct {
- XMLName xml.Name `xml:"numbering"`
- Text string `xml:",chardata"`
- Wpc string `xml:"wpc,attr"`
- Cx string `xml:"cx,attr"`
- Cx1 string `xml:"cx1,attr"`
- Mc string `xml:"mc,attr"`
- O string `xml:"o,attr"`
- R string `xml:"r,attr"`
- M string `xml:"m,attr"`
- V string `xml:"v,attr"`
- Wp14 string `xml:"wp14,attr"`
- Wp string `xml:"wp,attr"`
- W10 string `xml:"w10,attr"`
- W string `xml:"w,attr"`
- W14 string `xml:"w14,attr"`
- W15 string `xml:"w15,attr"`
- W16se string `xml:"w16se,attr"`
- Wpg string `xml:"wpg,attr"`
- Wpi string `xml:"wpi,attr"`
- Wne string `xml:"wne,attr"`
- Wps string `xml:"wps,attr"`
- Ignorable string `xml:"Ignorable,attr"`
- AbstractNum []struct {
- Text string `xml:",chardata"`
- AbstractNumID string `xml:"abstractNumId,attr"`
- RestartNumberingAfterBreak string `xml:"restartNumberingAfterBreak,attr"`
- Nsid TextVal `xml:"nsid"`
- MultiLevelType TextVal `xml:"multiLevelType"`
- Tmpl TextVal `xml:"tmpl"`
- Lvl []NumberingLvl `xml:"lvl"`
- } `xml:"abstractNum"`
- Num []struct {
- Text string `xml:",chardata"`
- NumID string `xml:"numId,attr"`
- AbstractNumID TextVal `xml:"abstractNumId"`
- } `xml:"num"`
- }
- type file struct {
- rels Relationships
- num Numbering
- r *zip.ReadCloser
- embed bool
- list map[string]int
- name string
- }
// Node is a generic XML element: its name, attributes, raw inner XML and
// child elements. The whole document body is decoded into a tree of Nodes.
type Node struct {
	XMLName xml.Name
	Attrs   []xml.Attr `xml:"-"`         // filled in by UnmarshalXML, not by the decoder
	Content []byte     `xml:",innerxml"` // raw XML between the start and end tag
	Nodes   []Node     `xml:",any"`      // child elements, in document order
}

// UnmarshalXML implements xml.Unmarshaler. It records the attributes of the
// start element and then lets the decoder fill in the remaining fields.
func (n *Node) UnmarshalXML(d *xml.Decoder, start xml.StartElement) error {
	// plain has Node's fields but none of its methods, so decoding into it
	// does not call UnmarshalXML again for this same element.
	type plain Node

	n.Attrs = start.Attr
	return d.DecodeElement((*plain)(n), &start)
}
// escape returns s with a backslash inserted in front of every occurrence of
// any character listed in set.
func escape(s, set string) string {
	pairs := make([]string, 0, 2*len(set))
	for _, ch := range set {
		lit := string(ch)
		pairs = append(pairs, lit, `\`+lit)
	}
	return strings.NewReplacer(pairs...).Replace(s)
}
- func (zf *file) extract(rel *Relationship, w io.Writer) error {
- err := os.MkdirAll(
- filepath.Join("uploads",
- strings.TrimSuffix(zf.name, ".docx"),
- filepath.Dir(rel.Target)),
- 0755)
- if err != nil {
- return err
- }
- for _, f := range zf.r.File {
- if f.Name != "word/"+rel.Target {
- continue
- }
- rc, err := f.Open()
- if err != nil {
- return err
- }
- defer rc.Close()
- b := make([]byte, f.UncompressedSize64)
- n, err := rc.Read(b)
- if err != nil && err != io.EOF {
- return err
- }
- if zf.embed {
- fmt.Fprintf(w, "",
- base64.StdEncoding.EncodeToString(b[:n]))
- } else {
- err = ioutil.WriteFile(
- filepath.Join("uploads",
- strings.TrimSuffix(zf.name, ".docx"),
- rel.Target),
- b, 0644)
- if err != nil {
- return err
- }
- fmt.Fprintf(w, "", "/"+filepath.Join(
- "uploads",
- strings.TrimSuffix(zf.name, ".docx"),
- escape(rel.Target, "()")))
- }
- break
- }
- return nil
- }
// attr looks up the first attribute whose local name equals name (the
// namespace is ignored). It returns the attribute value and true, or "" and
// false when there is no such attribute.
func attr(attrs []xml.Attr, name string) (string, bool) {
	for i := range attrs {
		if a := &attrs[i]; a.Name.Local == name {
			return a.Value, true
		}
	}
	return "", false
}
- func (zf *file) walk(node *Node, w io.Writer) error {
- switch node.XMLName.Local {
- case "hyperlink":
- fmt.Fprint(w, "[")
- var cbuf bytes.Buffer
- for _, n := range node.Nodes {
- if err := zf.walk(&n, &cbuf); err != nil {
- return err
- }
- }
- fmt.Fprint(w, escape(cbuf.String(), "[]"))
- fmt.Fprint(w, "]")
- fmt.Fprint(w, "(")
- if id, ok := attr(node.Attrs, "id"); ok {
- for _, rel := range zf.rels.Relationship {
- if id == rel.ID {
- fmt.Fprint(w, escape(rel.Target, "()"))
- break
- }
- }
- }
- fmt.Fprint(w, ")")
- case "t":
- fmt.Fprint(w, string(node.Content))
- case "pPr":
- code := false
- for _, n := range node.Nodes {
- switch n.XMLName.Local {
- case "ind":
- if left, ok := attr(n.Attrs, "left"); ok {
- if i, err := strconv.Atoi(left); err == nil && i > 0 {
- fmt.Fprint(w, strings.Repeat(" ", i/360))
- }
- }
- case "pStyle":
- if val, ok := attr(n.Attrs, "val"); ok {
- if strings.HasPrefix(val, "Heading") {
- if i, err := strconv.Atoi(val[7:]); err == nil && i > 0 {
- fmt.Fprint(w, strings.Repeat("#", i)+" ")
- }
- } else if val == "Code" {
- code = true
- } else {
- if i, err := strconv.Atoi(val); err == nil && i > 0 {
- fmt.Fprint(w, strings.Repeat("#", i)+" ")
- }
- }
- }
- case "numPr":
- numID := ""
- ilvl := ""
- numFmt := ""
- start := 1
- ind := 0
- for _, nn := range n.Nodes {
- if nn.XMLName.Local == "numId" {
- if val, ok := attr(nn.Attrs, "val"); ok {
- numID = val
- }
- }
- if nn.XMLName.Local == "ilvl" {
- if val, ok := attr(nn.Attrs, "val"); ok {
- ilvl = val
- }
- }
- }
- for _, num := range zf.num.Num {
- if numID != num.NumID {
- continue
- }
- for _, abnum := range zf.num.AbstractNum {
- if abnum.AbstractNumID != num.AbstractNumID.Val {
- continue
- }
- for _, ablvl := range abnum.Lvl {
- if ablvl.Ilvl != ilvl {
- continue
- }
- if i, err := strconv.Atoi(ablvl.Start.Val); err == nil {
- start = i
- }
- if i, err := strconv.Atoi(ablvl.PPr.Ind.Left); err == nil {
- ind = i / 360
- }
- numFmt = ablvl.NumFmt.Val
- break
- }
- break
- }
- break
- }
- fmt.Fprint(w, strings.Repeat(" ", ind))
- switch numFmt {
- case "decimal", "aiueoFullWidth":
- key := fmt.Sprintf("%s:%d", numID, ind)
- cur, ok := zf.list[key]
- if !ok {
- zf.list[key] = start
- } else {
- zf.list[key] = cur + 1
- }
- fmt.Fprintf(w, "%d. ", zf.list[key])
- case "bullet":
- fmt.Fprint(w, "* ")
- }
- }
- }
- if code {
- fmt.Fprint(w, "`")
- }
- for _, n := range node.Nodes {
- if err := zf.walk(&n, w); err != nil {
- return err
- }
- }
- if code {
- fmt.Fprint(w, "`")
- }
- case "tbl":
- var rows [][]string
- for _, tr := range node.Nodes {
- if tr.XMLName.Local != "tr" {
- continue
- }
- var cols []string
- for _, tc := range tr.Nodes {
- if tc.XMLName.Local != "tc" {
- continue
- }
- var cbuf bytes.Buffer
- if err := zf.walk(&tc, &cbuf); err != nil {
- return err
- }
- cols = append(cols, strings.Replace(cbuf.String(), "\n", "", -1))
- }
- rows = append(rows, cols)
- }
- maxcol := 0
- for _, cols := range rows {
- if len(cols) > maxcol {
- maxcol = len(cols)
- }
- }
- widths := make([]int, maxcol)
- for _, row := range rows {
- for i := 0; i < maxcol; i++ {
- if i < len(row) {
- width := runewidth.StringWidth(row[i])
- if widths[i] < width {
- widths[i] = width
- }
- }
- }
- }
- for i, row := range rows {
- if i == 0 {
- for j := 0; j < maxcol; j++ {
- fmt.Fprint(w, "|")
- fmt.Fprint(w, strings.Repeat(" ", widths[j]))
- }
- fmt.Fprint(w, "|\n")
- for j := 0; j < maxcol; j++ {
- fmt.Fprint(w, "|")
- fmt.Fprint(w, strings.Repeat("-", widths[j]))
- }
- fmt.Fprint(w, "|\n")
- }
- for j := 0; j < maxcol; j++ {
- fmt.Fprint(w, "|")
- if j < len(row) {
- width := runewidth.StringWidth(row[j])
- fmt.Fprint(w, escape(row[j], "|"))
- fmt.Fprint(w, strings.Repeat(" ", widths[j]-width))
- } else {
- fmt.Fprint(w, strings.Repeat(" ", widths[j]))
- }
- }
- fmt.Fprint(w, "|\n")
- }
- fmt.Fprint(w, "\n")
- case "r":
- bold := false
- italic := false
- strike := false
- for _, n := range node.Nodes {
- if n.XMLName.Local != "rPr" {
- continue
- }
- for _, nn := range n.Nodes {
- switch nn.XMLName.Local {
- case "b":
- bold = true
- case "i":
- italic = true
- case "strike":
- strike = true
- }
- }
- }
- if strike {
- fmt.Fprint(w, "~~")
- }
- if bold {
- fmt.Fprint(w, "**")
- }
- if italic {
- fmt.Fprint(w, "*")
- }
- var cbuf bytes.Buffer
- for _, n := range node.Nodes {
- if err := zf.walk(&n, &cbuf); err != nil {
- return err
- }
- }
- fmt.Fprint(w, escape(cbuf.String(), `*~\`))
- if italic {
- fmt.Fprint(w, "*")
- }
- if bold {
- fmt.Fprint(w, "**")
- }
- if strike {
- fmt.Fprint(w, "~~")
- }
- case "p":
- for _, n := range node.Nodes {
- if err := zf.walk(&n, w); err != nil {
- return err
- }
- }
- fmt.Fprintln(w)
- case "blip":
- if id, ok := attr(node.Attrs, "embed"); ok {
- for _, rel := range zf.rels.Relationship {
- if id != rel.ID {
- continue
- }
- if err := zf.extract(&rel, w); err != nil {
- return err
- }
- }
- }
- case "Fallback":
- case "txbxContent":
- var cbuf bytes.Buffer
- for _, n := range node.Nodes {
- if err := zf.walk(&n, &cbuf); err != nil {
- return err
- }
- }
- fmt.Fprintln(w, "\n```\n"+cbuf.String()+"```")
- default:
- for _, n := range node.Nodes {
- if err := zf.walk(&n, w); err != nil {
- return err
- }
- }
- }
- return nil
- }
- func readFile(f *zip.File) (*Node, error) {
- rc, err := f.Open()
- defer rc.Close()
- b, _ := ioutil.ReadAll(rc)
- if err != nil {
- return nil, err
- }
- var node Node
- err = xml.Unmarshal(b, &node)
- if err != nil {
- return nil, err
- }
- return &node, nil
- }
// findFile returns the first entry of files whose name matches the path.Match
// pattern target, or nil when none does. A malformed pattern matches nothing.
func findFile(files []*zip.File, target string) *zip.File {
	for i := range files {
		matched, _ := path.Match(target, files[i].Name)
		if matched {
			return files[i]
		}
	}
	return nil
}
- func Docx2md(arg string, embed bool) (string, error) {
- r, err := zip.OpenReader(arg)
- if err != nil {
- return "", err
- }
- defer r.Close()
- var rels Relationships
- var num Numbering
- for _, f := range r.File {
- switch f.Name {
- case "word/_rels/document.xml.rels":
- rc, err := f.Open()
- defer rc.Close()
- b, _ := ioutil.ReadAll(rc)
- if err != nil {
- return "", err
- }
- err = xml.Unmarshal(b, &rels)
- if err != nil {
- return "", err
- }
- case "word/numbering.xml":
- rc, err := f.Open()
- defer rc.Close()
- b, _ := ioutil.ReadAll(rc)
- if err != nil {
- return "", err
- }
- err = xml.Unmarshal(b, &num)
- if err != nil {
- return "", err
- }
- }
- }
- f := findFile(r.File, "word/document*.xml")
- if f == nil {
- return "", errors.New("incorrect document")
- }
- node, err := readFile(f)
- if err != nil {
- return "", err
- }
- fileNames := strings.Split(arg, "/")
- fileName := fileNames[len(fileNames)-1]
- // make sure the file name
- if !strings.HasSuffix(fileName, ".docx") {
- log.Fatal("File name must end with .docx")
- }
- var buf bytes.Buffer
- zf := &file{
- r: r,
- rels: rels,
- num: num,
- embed: embed,
- list: make(map[string]int),
- name: fileName,
- }
- err = zf.walk(node, &buf)
- if err != nil {
- return "", err
- }
- return buf.String(), nil
- }
|