html_parser.cljs 10 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214
  1. (ns frontend.extensions.html-parser
  2. (:require [cljs.core.match :refer [match]]
  3. [clojure.string :as string]
  4. [clojure.walk :as walk]
  5. [frontend.config :as config]
  6. [frontend.util :as util]
  7. [logseq.graph-parser.util :as gp-util]
  8. [hickory.core :as hickory]))
  ;; Module-wide flag consulted by the :code branch of hiccup->doc-inner.
  ;; The :pre branch switches it on while it renders its children so that
  ;; nested code elements are emitted without inline-code markers.
  (defonce *inside-pre? (atom false))
  (defn- hiccup-without-style
    "Returns `hiccup` with the :style key removed from every map found anywhere
    inside it. Non-map values are returned untouched."
    [hiccup]
    (letfn [(strip-style [node]
              (cond-> node
                (map? node) (dissoc :style)))]
      (walk/postwalk strip-style hiccup)))
  (defn- export-hiccup
    "Wraps the printed form of `hiccup` (with every :style attribute removed) in a
    `#+BEGIN_EXPORT hiccup` / `#+END_EXPORT` block and returns it as a string."
    [hiccup]
    (let [printed (-> hiccup
                      hiccup-without-style
                      str)]
      (util/format "#+BEGIN_EXPORT hiccup\n%s\n#+END_EXPORT" printed)))
  (defn ^:large-vars/cleanup-todo hiccup->doc-inner
    "Converts hiccup into text for `format`; the branches below produce output for
    :markdown and :org and yield nothing for any other format.

    `hiccup` may be a string, a single element vector `[tag attrs & children]`,
    or a collection whose first item is an element vector (each item is then
    converted in turn). Returns a string. A value that is neither a vector nor
    a string is reported with println and contributes nothing to the result."
    [format hiccup]
    (let [transform-fn (fn [node]
                         (hiccup->doc-inner format node))
          block-pattern (config/get-block-pattern format)
          ;; Converts every child and concatenates the results without separator.
          map-join (fn [children] (apply str (map transform-fn children)))
          ;; Headings: `level` repetitions of the block pattern, a space, then
          ;; the converted children joined by single spaces.
          block-transform (fn [level children]
                            (str (apply str (repeat level block-pattern))
                                 " "
                                 (->> (map transform-fn children)
                                      (string/join " "))
                                 "\n"))
          ;; Surrounds the converted children with the marker that config
          ;; returns for the tag's emphasis kind.
          emphasis-transform (fn [tag _attrs children]
                               (let [pattern (cond
                                               (contains? #{:b :strong} tag)
                                               (config/get-bold format)
                                               (contains? #{:i :em} tag)
                                               (config/get-italic format)
                                               (contains? #{:ins} tag)
                                               (config/get-underline format)
                                               (contains? #{:del} tag)
                                               (config/get-strike-through format)
                                               (contains? #{:mark} tag)
                                               (config/get-highlight format)
                                               :else
                                               nil)]
                                 (str pattern (map-join children) pattern)))
          ;; Inline code. Inside a pre element the children are emitted bare.
          ;; Otherwise plain-text children are concatenated and wrapped in the
          ;; code marker. When a child is itself an element, the children are
          ;; converted without the marker instead of printing a raw hiccup
          ;; vector or silently dropping everything after the first child.
          code-transform (fn [children]
                           (cond
                             @*inside-pre?
                             (map-join children)

                             (every? string? children)
                             (let [pattern (config/get-code format)]
                               (str " " pattern (apply str children) pattern " "))

                             :else
                             (map-join children)))
          ;; Preformatted block. The flag is raised only while the children are
          ;; rendered and its previous value is restored even when rendering
          ;; throws, so a failure cannot leave later conversions stuck in
          ;; pre mode and a nested pre cannot clear the flag too early.
          pre-transform (fn [children]
                          (let [outer-inside-pre? @*inside-pre?
                                _ (reset! *inside-pre? true)
                                content (try
                                          (string/trim (map-join children))
                                          (finally
                                            (reset! *inside-pre? outer-inside-pre?)))]
                            (case format
                              :markdown (if (util/starts-with? content "```")
                                          content
                                          (str "```\n" content "\n```"))
                              :org (if (util/starts-with? content "#+BEGIN_SRC")
                                     content
                                     (util/format "#+BEGIN_SRC\n%s\n#+END_SRC" content))
                              nil)))
          ;; Separator line placed under a table header. The column count is
          ;; the number of children of the first row element, which is exactly
          ;; the number of cells the :tr branch renders for that row.
          table-separator (fn [children open sep close]
                            (let [header-row (first (filter vector? children))
                                  columns (count (drop 2 header-row))]
                              (str open
                                   (string/join sep (repeat columns "----"))
                                   close)))
          ;; Adds the blank lines / newline that surround block-level tags.
          wrapper (fn [tag content]
                    (cond
                      (contains? #{:p :hr :ul :ol :dl :table :pre :blockquote :aside :canvas
                                   :center :figure :figcaption :fieldset :div :footer
                                   :header} tag)
                      (str "\n\n" content "\n\n")
                      (contains? #{:thead :tr :li} tag)
                      (str content "\n")
                      :else
                      content))
          single-hiccup-transform
          (fn [x]
            (cond
              (vector? x)
              (let [[tag attrs & children] x
                    result (match tag
                             :head nil
                             :h1 (block-transform 1 children)
                             :h2 (block-transform 2 children)
                             :h3 (block-transform 3 children)
                             :h4 (block-transform 4 children)
                             :h5 (block-transform 5 children)
                             :h6 (block-transform 6 children)
                             :a (let [href (:href attrs)
                                      label (map-join children)
                                      has-img-tag? (gp-util/safe-re-find #"\[:img" (str x))]
                                  ;; Links that contain an image are exported as raw hiccup.
                                  (if has-img-tag?
                                    (export-hiccup x)
                                    (case format
                                      :markdown (util/format "[%s](%s)" label href)
                                      :org (util/format "[[%s][%s]]" href label)
                                      nil)))
                             :img (let [src (:src attrs)
                                        ;; Images frequently have no alt attribute; use an
                                        ;; empty label rather than handing nil to the formatter.
                                        alt (or (:alt attrs) "")]
                                    (case format
                                      :markdown (util/format "![%s](%s)" alt src)
                                      :org (util/format "[[%s][%s]]" src alt)
                                      nil))
                             :p (util/format "%s"
                                             (map-join children))
                             :hr (config/get-hr format)
                             (_ :guard #(contains? #{:b :strong
                                                     :i :em
                                                     :ins
                                                     :del
                                                     :mark} %))
                             (emphasis-transform tag attrs children)
                             :code (code-transform children)
                             :pre (pre-transform children)
                             :blockquote
                             (case format
                               :markdown (str "> " (map-join children))
                               :org (util/format "#+BEGIN_QUOTE\n%s\n#+END_QUOTE" (map-join children))
                               nil)
                             :li
                             (str "- " (map-join children))
                             :dt
                             (case format
                               :org (str "- " (map-join children) " ")
                               :markdown (str (map-join children) "\n")
                               nil)
                             :dd
                             (case format
                               :markdown (str ": " (map-join children) "\n")
                               :org (str ":: " (map-join children) "\n")
                               nil)
                             :thead
                             (case format
                               :markdown (str (map-join children)
                                              (table-separator children "| " " | " " |"))
                               :org (str (map-join children)
                                         (table-separator children "|" "+" "|"))
                               nil)
                             :tr
                             (str "| "
                                  (->> (map transform-fn children)
                                       (string/join " | "))
                                  " |")
                             ;; These layout tags are kept as raw hiccup exports.
                             (_ :guard #(contains? #{:aside :center :figure :figcaption :fieldset :footer :header} %))
                             (export-hiccup x)
                             :else (map-join children))]
                (wrapper tag result))
              (string? x)
              x
              :else
              (println "hiccup->doc error: " x)))
          result (if (vector? (first hiccup))
                   (for [x hiccup]
                     (single-hiccup-transform x))
                   (single-hiccup-transform hiccup))]
      (apply str result)))
  (defn hiccup->doc
    "Converts `hiccup` to text for `format` via hiccup->doc-inner and tidies the
    result: returns \"\" when the converted text is blank, otherwise the trimmed
    text with every run of three or more newlines collapsed to one blank line."
    [format hiccup]
    (let [s (hiccup->doc-inner format hiccup)]
      (if (string/blank? s)
        ""
        (-> s
            (string/trim)
            ;; One regex pass handles runs of any length. Two fixed-width
            ;; literal replacements leave runs of six or more newlines only
            ;; partly collapsed (nested block tags produce such runs).
            (string/replace #"\n{3,}" "\n\n")))))
  (defn html-decode-hiccup
    "Walks `hiccup` and passes every string in it through
    goog.string.unescapeEntities; all other values are left as they are."
    [hiccup]
    (letfn [(decode [node]
              (if-not (string? node)
                node
                (goog.string.unescapeEntities node)))]
      (walk/postwalk decode hiccup)))
  (defn parse
    "Parses the `html` string with hickory, decodes HTML entities in the resulting
    hiccup and converts it to text for `format`. Returns nil when `html` is
    blank."
    [format html]
    (if (string/blank? html)
      nil
      (->> html
           hickory/parse
           hickory/as-hiccup
           html-decode-hiccup
           (hiccup->doc format))))
  (comment
    ;; Scratch material; nothing in this form is evaluated when the file loads.
    ;;
    ;; A small Markdown table:
    ;; | Syntax | Description | Test Text |
    ;; | :--- | :----: | ---: |
    ;; | Header | Title | Here's this |
    ;; | Paragraph | Text | And more |

    ;; Hiccup of a link that wraps an image.
    (def img-link
      [:a
       {:href "https://www.markdownguide.org/book/",
        :style "box-sizing: border-box; color: rgb(0, 123, 255); text-decoration: none; background-color: transparent;"}
       [:img
        {:src "https://d33wubrfki0l68.cloudfront.net/cb41dd8e38b0543a305f9c56db89b46caa802263/25192/assets/images/book-cover.jpg",
         :class "card-img",
         :alt "Markdown Guide book cover",
         :style "box-sizing: border-box; vertical-align: middle; border-style: none; flex-shrink: 0; width: 205.75px; border-radius: calc(0.25rem - 1px);"}]]))