extract.cljc 17 KB

123456789101112131415161718192021222324252627282930313233343536373839404142434445464748495051525354555657585960616263646566676869707172737475767778798081828384858687888990919293949596979899100101102103104105106107108109110111112113114115116117118119120121122123124125126127128129130131132133134135136137138139140141142143144145146147148149150151152153154155156157158159160161162163164165166167168169170171172173174175176177178179180181182183184185186187188189190191192193194195196197198199200201202203204205206207208209210211212213214215216217218219220221222223224225226227228229230231232233234235236237238239240241242243244245246247248249250251252253254255256257258259260261262263264265266267268269270271272273274275276277278279280281282283284285286287288289290291292293294295296297298299300301302303304305306307308309310311312313314315316317318319320321322323324325326327328329330331332333334335336337338339340341342343344345346347348349350351352
  1. (ns logseq.graph-parser.extract
  2. "Handles extraction of blocks, pages and mldoc ast in preparation for db
  3. transaction"
  4. ;; Disable clj linters since we don't support clj
  5. #?(:clj {:clj-kondo/config {:linters {:unresolved-namespace {:level :off}
  6. :unresolved-symbol {:level :off}}}})
  7. (:require [clojure.set :as set]
  8. [clojure.string :as string]
  9. [clojure.walk :as walk]
  10. [datascript.core :as d]
  11. [logseq.graph-parser.text :as text]
  12. [logseq.common.util :as common-util]
  13. [logseq.graph-parser.mldoc :as gp-mldoc]
  14. [logseq.graph-parser.block :as gp-block]
  15. [logseq.graph-parser.property :as gp-property]
  16. [logseq.common.config :as common-config]
  17. #?(:org.babashka/nbb [logseq.common.log :as log]
  18. :default [lambdaisland.glogi :as log])
  19. [logseq.graph-parser.whiteboard :as gp-whiteboard]
  20. [logseq.db :as ldb]))
  21. (defn- filepath->page-name
  22. [filepath]
  23. (when-let [file-name (last (string/split filepath #"/"))]
  24. (let [result (first (common-util/split-last "." file-name))
  25. ext (string/lower-case (common-util/get-file-ext filepath))]
  26. (if (or (common-config/mldoc-support? ext) (= "edn" ext))
  27. (common-util/safe-decode-uri-component (string/replace result "." "/"))
  28. result))))
  29. (defn- path->file-name
  30. ;; Only for internal paths, as they are converted to POXIS already
  31. ;; https://github.com/logseq/logseq/blob/48b8e54e0fdd8fbd2c5d25b7f1912efef8814714/deps/graph-parser/src/logseq/graph_parser/extract.cljc#L32
  32. ;; Should be converted to POXIS first for external paths
  33. [path]
  34. (if (string/includes? path "/")
  35. (last (common-util/split-last "/" path))
  36. path))
  37. (defn- path->file-body
  38. [path]
  39. (when-let [file-name (path->file-name path)]
  40. (if (string/includes? file-name ".")
  41. (first (common-util/split-last "." file-name))
  42. file-name)))
  43. (defn- safe-url-decode
  44. [string]
  45. (if (string/includes? string "%")
  46. (some-> string str common-util/safe-decode-uri-component)
  47. string))
  48. (defn- decode-namespace-underlines
  49. "Decode namespace underlines to slashed;
  50. If continuous underlines, only decode at start;
  51. Having empty namespace is invalid."
  52. [string]
  53. (string/replace string "___" "/"))
  54. (defn- make-valid-namespaces
  55. "Remove those empty namespaces from title to make it a valid page name."
  56. [title]
  57. (->> (string/split title "/")
  58. (remove empty?)
  59. (string/join "/")))
  60. (defn- tri-lb-title-parsing
  61. "Parsing file name under the new file name format
  62. Avoid calling directly"
  63. [file-name]
  64. (some-> file-name
  65. (decode-namespace-underlines)
  66. (string/replace common-util/url-encoded-pattern safe-url-decode)
  67. (make-valid-namespaces)))
  68. ;; Keep for backward compatibility
  69. ;; Rule of dir-ver 0
  70. ;; Source: https://github.com/logseq/logseq/blob/e7110eea6790eda5861fdedb6b02c2a78b504cd9/deps/graph-parser/src/logseq/graph_parser/extract.cljc#L35
  71. (defn- legacy-title-parsing
  72. [file-name-body]
  73. (let [title (string/replace file-name-body "." "/")]
  74. (or (common-util/safe-decode-uri-component title) title)))
  75. (defn title-parsing
  76. "Convert file name in the given file name format to page title"
  77. [file-name-body filename-format]
  78. (case filename-format
  79. :triple-lowbar (tri-lb-title-parsing file-name-body)
  80. (legacy-title-parsing file-name-body)))
  81. (defn- get-page-name
  82. "Get page name with overridden order of
  83. `title::` property
  84. file name parsing
  85. first block content
  86. note: `page-name-order` is deprecated on Apr. 2021
  87. uri-encoded? - since paths on mobile are uri-encoded, need to decode them first
  88. filename-format - the format used to parse file name
  89. "
  90. [file-path ast uri-encoded? filename-format]
  91. ;; headline
  92. (let [ast (map first ast)
  93. file (if uri-encoded? (js/decodeURI file-path) file-path)]
  94. ;; check backward compatibility?
  95. ;; FIXME: use pre-config dir
  96. (if (string/starts-with? file "pages/contents.")
  97. "Contents"
  98. (let [first-block (last (first (filter gp-block/heading-block? ast)))
  99. property-name (when (contains? #{"Properties" "Property_Drawer"} (ffirst ast))
  100. (let [properties-ast (second (first ast))
  101. properties (zipmap (map (comp keyword string/lower-case first) properties-ast)
  102. (map second properties-ast))]
  103. (:title properties)))
  104. first-block-name (let [title (last (first (:title first-block)))]
  105. (and first-block
  106. (string? title)
  107. title))
  108. file-name (when-let [result (path->file-body file)]
  109. (if (common-config/mldoc-support? (common-util/get-file-ext file))
  110. (title-parsing result filename-format)
  111. result))]
  112. (or property-name
  113. file-name
  114. first-block-name)))))
  115. (defn- extract-page-alias-and-tags
  116. [page-m page-name properties]
  117. (let [alias (:alias properties)
  118. alias' (if (coll? alias) alias [(str alias)])
  119. aliases (and alias'
  120. (seq (remove #(or (= page-name (common-util/page-name-sanity-lc %))
  121. (string/blank? %)) ;; disable blank alias
  122. alias')))
  123. aliases' (keep
  124. (fn [alias]
  125. (let [page-name (common-util/page-name-sanity-lc alias)]
  126. {:block/name page-name
  127. :block/title alias}))
  128. aliases)
  129. result (cond-> page-m
  130. (seq aliases')
  131. (assoc :block/alias aliases')
  132. (:tags properties)
  133. (assoc :block/tags (let [tags (:tags properties)
  134. tags (if (coll? tags) tags [(str tags)])
  135. tags (remove string/blank? tags)]
  136. (map (fn [tag] {:block/name (common-util/page-name-sanity-lc tag)
  137. :block/title tag})
  138. tags))))]
  139. (update result :block/properties #(apply dissoc % gp-property/editable-linkable-built-in-properties))))
  140. (defn- build-page-map
  141. [properties invalid-properties properties-text-values file page page-name {:keys [date-formatter db from-page]}]
  142. (let [[*valid-properties *invalid-properties]
  143. ((juxt filter remove)
  144. (fn [[k _v]] (gp-property/valid-property-name? (str k))) properties)
  145. valid-properties (into {} *valid-properties)
  146. invalid-properties (set (->> (map (comp name first) *invalid-properties)
  147. (concat invalid-properties)))
  148. page-m (->
  149. (common-util/remove-nils-non-nested
  150. (assoc
  151. (gp-block/page-name->map page db true date-formatter
  152. :from-page from-page)
  153. :block/file {:file/path (common-util/path-normalize file)}))
  154. (extract-page-alias-and-tags page-name properties))]
  155. (cond->
  156. page-m
  157. (seq valid-properties)
  158. (assoc :block/properties valid-properties
  159. :block/properties-text-values (select-keys properties-text-values (keys valid-properties)))
  160. (seq invalid-properties)
  161. (assoc :block/invalid-properties invalid-properties))))
  162. (defn- attach-block-ids-if-match
  163. "If block-ids are provided and match the number of blocks, attach them to blocks
  164. If block-ids are provided but don't match the number of blocks, WARN and ignore
  165. If block-ids are not provided (nil), just ignore"
  166. [block-ids blocks]
  167. (or (when block-ids
  168. (if (= (count block-ids) (count blocks))
  169. (mapv (fn [block-id block]
  170. (if (some? block-id)
  171. (assoc block :block/uuid (uuid block-id))
  172. block))
  173. block-ids blocks)
  174. (log/error :gp-extract/attach-block-ids-not-match "attach-block-ids-if-match: block-ids provided, but doesn't match the number of blocks, ignoring")))
  175. blocks))
  176. ;; TODO: performance improvement
  177. (defn- extract-pages-and-blocks
  178. "uri-encoded? - if is true, apply URL decode on the file path
  179. options -
  180. :extracted-block-ids - An atom that contains all block ids that have been extracted in the current page (not yet saved to db)
  181. :resolve-uuid-fn - Optional fn which is called to resolve uuids of each block. Enables diff-merge
  182. (2 ways diff) based uuid resolution upon external editing.
  183. returns a list of the uuids, given the receiving ast, or nil if not able to resolve.
  184. Implemented in file-common-handler/diff-merge-uuids for IoC
  185. Called in gp-extract/extract as AST is being parsed and properties are extracted there"
  186. [format ast properties file content {:keys [date-formatter db filename-format extracted-block-ids resolve-uuid-fn]
  187. :or {extracted-block-ids (atom #{})
  188. resolve-uuid-fn (constantly nil)}
  189. :as options}]
  190. (assert db "Datascript DB is required")
  191. (try
  192. (let [db-based? (ldb/db-based-graph? db)
  193. page (get-page-name file ast false filename-format)
  194. [page page-name _journal-day] (gp-block/convert-page-if-journal page date-formatter)
  195. options' (assoc options :page-name page-name)
  196. ;; In case of diff-merge (2way) triggered, use the uuids to override the ones extracted from the AST
  197. override-uuids (resolve-uuid-fn format ast content options')
  198. blocks (->> (gp-block/extract-blocks ast content format options')
  199. (attach-block-ids-if-match override-uuids)
  200. (mapv #(gp-block/fix-block-id-if-duplicated! db page-name extracted-block-ids %))
  201. ;; FIXME: use page uuid
  202. (gp-block/with-parent-and-order {:block/name page-name})
  203. (vec))
  204. ref-pages (atom #{})
  205. blocks (map (fn [block]
  206. (if (contains? (:block/type block) "macro")
  207. block
  208. (let [block-ref-pages (seq (:block/refs block))]
  209. (when block-ref-pages
  210. (swap! ref-pages set/union (set block-ref-pages)))
  211. (-> block
  212. (dissoc :ref-pages)
  213. (assoc :block/format format
  214. :block/page [:block/name page-name]
  215. :block/refs block-ref-pages)))))
  216. blocks)
  217. [properties invalid-properties properties-text-values]
  218. (if (:block/pre-block? (first blocks))
  219. [(:block/properties (first blocks))
  220. (:block/invalid-properties (first blocks))
  221. (:block/properties-text-values (first blocks))]
  222. [properties [] {}])
  223. page-map (build-page-map properties invalid-properties properties-text-values file page page-name (assoc options' :from-page page))
  224. namespace-pages (when-not db-based?
  225. (let [page (:block/title page-map)]
  226. (when (text/namespace-page? page)
  227. (->> (common-util/split-namespace-pages page)
  228. (map (fn [page]
  229. (-> (gp-block/page-name->map page db true date-formatter)
  230. (assoc :block/format format))))))))
  231. pages (->> (concat
  232. [page-map]
  233. @ref-pages
  234. namespace-pages)
  235. ;; remove block references
  236. (remove vector?)
  237. (remove nil?))
  238. pages (common-util/distinct-by :block/name pages)
  239. pages (remove nil? pages)
  240. pages (map (fn [page]
  241. (let [page-id (or (when db
  242. (:block/uuid (ldb/get-page db (:block/name page))))
  243. (d/squuid))]
  244. (assoc page :block/uuid page-id)))
  245. pages)
  246. blocks (->> (remove nil? blocks)
  247. (map (fn [b] (dissoc b :block/title :block.temp/ast-body :block/level :block/children :block/meta))))]
  248. [pages blocks])
  249. (catch :default e
  250. (log/error :exception e))))
  251. (defn extract
  252. "Extracts pages, blocks and ast from given file"
  253. [file-path content {:keys [user-config verbose] :or {verbose true} :as options}]
  254. (if (string/blank? content)
  255. []
  256. (let [format (common-util/get-format file-path)
  257. _ (when verbose (println "Parsing start: " file-path))
  258. ast (gp-mldoc/->edn content (gp-mldoc/default-config format
  259. ;; {:parse_outline_only? true}
  260. ))]
  261. (when verbose (println "Parsing finished: " file-path))
  262. (let [first-block (ffirst ast)
  263. properties (let [properties (and (gp-property/properties-ast? first-block)
  264. (->> (last first-block)
  265. (map (fn [[x y mldoc-ast]]
  266. (let [k (if (keyword? x)
  267. (subs (str x) 1)
  268. x)]
  269. [(string/lower-case k) (text/parse-property k y mldoc-ast (assoc user-config :format format))])))
  270. (into {})
  271. (walk/keywordize-keys)))]
  272. (when (and properties (seq properties))
  273. (if (:filters properties)
  274. (update properties :filters
  275. (fn [v]
  276. (string/replace (or v "") "\\" "")))
  277. properties)))
  278. [pages blocks] (extract-pages-and-blocks format ast properties file-path content options)]
  279. {:pages pages
  280. :blocks blocks
  281. :ast ast}))))
  282. (defn extract-whiteboard-edn
  283. "Extracts whiteboard page from given edn file
  284. Whiteboard page edn is a subset of page schema
  285. - it will only contain a single page (for now). The page properties are stored under :logseq.tldraw.* properties and contain 'bindings' etc
  286. - blocks will be adapted to tldraw shapes. All blocks's parent is the given page."
  287. [file content {:keys [verbose] :or {verbose true}}]
  288. (let [_ (when verbose (println "Parsing start: " file))
  289. {:keys [pages blocks]} (common-util/safe-read-string content)
  290. blocks (map
  291. (fn [block]
  292. (-> block
  293. (common-util/dissoc-in [:block/parent :block/name])
  294. ;; :block/left here for backward compatibility
  295. (common-util/dissoc-in [:block/left :block/name])))
  296. blocks)
  297. serialized-page (first pages)
  298. ;; whiteboard edn file should normally have valid :block/title, :block/name, :block/uuid
  299. page-name (-> (or (:block/name serialized-page)
  300. (filepath->page-name file))
  301. (common-util/page-name-sanity-lc))
  302. title (or (:block/title serialized-page)
  303. page-name)
  304. page-block (merge {:block/name page-name
  305. :block/title title
  306. :block/type #{"whiteboard" "page"}
  307. :block/file {:file/path (common-util/path-normalize file)}}
  308. serialized-page)
  309. page-block (gp-whiteboard/migrate-page-block page-block)
  310. blocks (->> blocks
  311. (map gp-whiteboard/migrate-shape-block)
  312. (map #(merge % (gp-whiteboard/with-whiteboard-block-props % [:block/uuid (:block/uuid page-block)]))))
  313. _ (when verbose (println "Parsing finished: " file))]
  314. {:pages (list page-block)
  315. :blocks blocks}))
  316. (defn- with-block-uuid
  317. [pages]
  318. (->> (common-util/distinct-by :block/name pages)
  319. (map (fn [page]
  320. (if (:block/uuid page)
  321. page
  322. (assoc page :block/uuid (d/squuid)))))))
  323. (defn with-ref-pages
  324. [pages blocks]
  325. (let [ref-pages (->> (mapcat :block/refs blocks)
  326. (filter :block/name))]
  327. (->> (concat pages ref-pages)
  328. (group-by :block/name)
  329. vals
  330. (map (partial apply merge))
  331. (with-block-uuid))))