Pārlūkot izejas kodu

enhance: import basic pdf text annotation

Only reads the edn for now. Works in the UI and CLI.
Also correctly keeps the annotation edn from being copied, since
it's no longer used.
Gabriel Horner 3 mēneši atpakaļ
vecāks
revīzija
536f39cbbd

+ 3 - 2
deps/graph-parser/script/db_import.cljs

@@ -57,14 +57,15 @@
            {:size (.-length buffer)
             :checksum checksum
             :type (db-asset/asset-path->type (:path file))
-            :path (:path file)})))
+            :path (:path file)})
+    buffer))
 
 (defn- <copy-asset-file [asset-m db-graph-dir]
   (p/let [parent-dir (node-path/join db-graph-dir common-config/local-assets-dir)
           _ (fsp/mkdir parent-dir #js {:recursive true})]
     (if (:block/uuid asset-m)
       (fsp/copyFile (:path asset-m) (node-path/join parent-dir (str (:block/uuid asset-m) "." (:type asset-m))))
-      (do
+      (when-not (:pdf-annotation? asset-m)
         (println "[INFO]" "Copied asset" (pr-str (node-path/basename (:path asset-m)))
                  "by its name since it was unused.")
         (fsp/copyFile (:path asset-m) (node-path/join parent-dir (node-path/basename (:path asset-m))))))))

+ 52 - 8
deps/graph-parser/src/logseq/graph_parser/exporter.cljs

@@ -35,7 +35,8 @@
             [logseq.graph-parser.block :as gp-block]
             [logseq.graph-parser.extract :as extract]
             [logseq.graph-parser.property :as gp-property]
-            [promesa.core :as p]))
+            [promesa.core :as p]
+            [logseq.graph-parser.utf8 :as utf8]))
 
 (defn- add-missing-timestamps
   "Add updated-at or created-at timestamps if they doesn't exist"
@@ -1005,9 +1006,41 @@
           block-title
           asset-name-to-uuids))
 
+(defn- build-annotation-tx
+  "Creates annotation blocks for a pdf asset from the highlights in the asset's edn file. Missing attributes are logged with log-fn and replaced by defaults"
+  [asset-edn-map new-asset {:keys [log-fn] :or {log-fn prn}}]
+  (let [color-text-idents
+        (->> (get-in db-property/built-in-properties [:logseq.property.pdf/hl-color :closed-values])
+             (map (juxt :value :db-ident))
+             (into {}))]
+    (mapv #(let [user-attributes
+                 {:logseq.property.pdf/hl-color (get color-text-idents (get-in % [:properties :color]))
+                  :logseq.property.pdf/hl-page (:page %)
+                  :block/title (get-in % [:content :text])}
+                 _ (when (some (comp nil? val) user-attributes)
+                     (log-fn :missing-annotation-attributes "Annotation is missing some attributes so set reasonable defaults for them"
+                             {:annotation user-attributes :asset (:block/title new-asset)}))
+                 annotation (merge
+                             ;; Reasonable defaults for user attributes
+                             {:logseq.property.pdf/hl-color :logseq.property/color.yellow
+                              :logseq.property.pdf/hl-page 1
+                              :block/title ""}
+                             ;; Drop nil values first; merging them as-is would overwrite the defaults with nil
+                             (into {} (remove (comp nil? val)) user-attributes)
+                             {:block/uuid (d/squuid)
+                              :block/order (db-order/gen-key)
+                              :logseq.property/ls-type :annotation
+                              :logseq.property.pdf/hl-value (dissoc % :id)
+                              :logseq.property/asset [:block/uuid (:block/uuid new-asset)]
+                              :block/tags [:logseq.class/Pdf-annotation]
+                              :block/parent [:block/uuid (:block/uuid new-asset)]
+                              :block/page :logseq.class/Asset})]
+             (add-missing-timestamps annotation))
+          (get-in asset-edn-map [:edn-content :highlights]))))
+
 (defn- handle-assets-in-block
   "If a block contains assets, creates them as #Asset nodes in the Asset page and references them in the block."
-  [block {:keys [asset-links]} {:keys [assets ignored-assets]}]
+  [block {:keys [asset-links]} {:keys [assets ignored-assets]} opts]
   (if (seq asset-links)
     (let [asset-maps
           (keep
@@ -1028,12 +1061,19 @@
                                            :logseq.property.asset/size (:size asset-data)
                                            :block/title (db-asset/asset-name->title (node-path/basename asset-name))}
                                           (when-let [metadata (not-empty (common-util/safe-read-map-string (:metadata (second asset-link))))]
-                                            {:logseq.property.asset/resize-metadata metadata}))]
-                      ;;  (prn :asset-added! (node-path/basename asset-name) #_(get @assets asset-name))
+                                            {:logseq.property.asset/resize-metadata metadata}))
+                         asset-edn-path (when (= "pdf" (path/file-ext asset-name)) (string/replace-first asset-name #"(?i)\.pdf$" ".edn"))
+                         ;; Mark edn asset so it isn't treated like a normal asset later
+                         _ (when (get @assets asset-edn-path)
+                             (swap! assets assoc-in [asset-edn-path :pdf-annotation?] true))
+                         asset-tx (concat [new-asset]
+                                          (when-let [asset-edn-map (get @assets asset-edn-path)]
+                                            (build-annotation-tx asset-edn-map new-asset opts)))]
+                     (prn :asset-added! (node-path/basename asset-name))
                       ;;  (cljs.pprint/pprint asset-link)
                      (swap! assets assoc-in [asset-name :block/uuid] (:block/uuid new-block))
                      {:asset-name-uuid [asset-name (:block/uuid new-asset)]
-                      :asset new-asset}))
+                      :asset-tx asset-tx}))
                  (do
                    (swap! ignored-assets conj
                           {:reason "No asset data found for this asset path"
@@ -1041,7 +1081,7 @@
                            :location {:block (:block/title block)}})
                    nil))))
            asset-links)
-          asset-blocks (keep :asset asset-maps)
+          asset-blocks (mapcat :asset-tx asset-maps)
           asset-names-to-uuids
           (into {} (map :asset-name-uuid asset-maps))]
       (cond-> {:block
@@ -1094,7 +1134,7 @@
         {block-after-built-in-props :block deadline-properties-tx :properties-tx}
         (update-block-deadline-and-scheduled block page-names-to-uuids options)
         {block-after-assets :block :keys [asset-blocks-tx]}
-        (handle-assets-in-block block-after-built-in-props walked-ast-blocks (select-keys import-state [:assets :ignored-assets]))
+        (handle-assets-in-block block-after-built-in-props walked-ast-blocks (select-keys import-state [:assets :ignored-assets]) (select-keys options [:log-fn]))
         ;; :block/page should be [:block/page NAME]
         journal-page-created-at (some-> (:block/page block*) second journal-created-ats)
         prepared-block (cond-> block-after-assets
@@ -1793,7 +1833,11 @@
                           (sort-by :path *asset-files)
                           (range 0 (count *asset-files)))
         read-asset (fn read-asset [{:keys [path] :as file}]
-                     (-> (<read-asset-file file assets)
+                     (-> (p/let [byte-array (<read-asset-file file assets)]
+                           (when (= "edn" (path/file-ext (:path file)))
+                             (swap! assets assoc-in
+                                    [(asset-path->name path) :edn-content]
+                                    (common-util/safe-read-map-string (utf8/decode byte-array)))))
                          (p/catch
                           (fn [error]
                             (notify-user {:msg (str "Import failed to read " (pr-str path) " with error:\n" (.-message error))

+ 22 - 9
deps/graph-parser/test/logseq/graph_parser/exporter_test.cljs

@@ -103,7 +103,8 @@
            {:size (.-length buffer)
             :checksum checksum
             :type (db-asset/asset-path->type (:path file))
-            :path (:path file)})))
+            :path (:path file)})
+    buffer))
 
 ;; Copied from db-import script and tweaked for an in-memory import
 (defn- import-file-graph-to-db
@@ -119,10 +120,11 @@
                         ;; asset file options
                          :<read-asset <read-asset-file
                          :<copy-asset (fn copy-asset [m]
-                                        (when-not (:block/uuid m)
-                                          (println "[INFO]" "Asset" (pr-str (node-path/basename (:path m)))
-                                                   "does not have a :block/uuid"))
-                                        (swap! assets conj m))}
+                                        (if (:block/uuid m)
+                                          (swap! assets conj m)
+                                          (when-not (:pdf-annotation? m)
+                                            (println "[INFO]" "Asset" (pr-str (node-path/basename (:path m)))
+                                                     "does not have a :block/uuid"))))}
                         (select-keys options [:verbose]))]
     (gp-exporter/export-file-graph conn conn config-file *files options')))
 
@@ -190,7 +192,7 @@
                 (remove #(= [{:db/ident :logseq.class/Tag}] (:block/tags %)))))
         "All classes only have :logseq.class/Tag as their tag (and don't have Page)")))
 
-(deftest-async export-basic-graph-with-convert-all-tags
+(deftest-async ^:focus export-basic-graph-with-convert-all-tags
   ;; This graph will contain basic examples of different features to import
   (p/let [file-graph-dir "test/resources/exporter-test-graph"
           conn (db-test/create-conn)
@@ -206,13 +208,14 @@
 
       ;; Counts
       ;; Includes journals as property values e.g. :logseq.property/deadline
-      (is (= 27 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Journal]] @conn))))
+      (is (= 28 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Journal]] @conn))))
 
-      (is (= 3 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Asset]] @conn))))
+      (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Asset]] @conn))))
       (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Task]] @conn))))
       (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Query]] @conn))))
       (is (= 2 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Card]] @conn))))
       (is (= 3 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Quote-block]] @conn))))
+      (is (= 2 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Pdf-annotation]] @conn))))
 
       ;; Properties and tags aren't included in this count as they aren't a Page
       (is (= 10
@@ -235,7 +238,8 @@
       (is (= 0 (count @(:ignored-properties import-state))) "No ignored properties")
       (is (= 0 (count @(:ignored-assets import-state))) "No ignored assets")
       (is (= 1 (count @(:ignored-files import-state))) "Ignore .edn for now")
-      (is (= 3 (count @assets))))
+      ;; (cljs.pprint/pprint @(:ignored-files import-state))
+      (is (= 5 (count @assets))))
 
     (testing "logseq files"
       (is (= ".foo {}\n"
@@ -418,6 +422,15 @@
       (is (= (d/entity @conn :logseq.class/Asset)
              (:block/page (db-test/find-block-by-content @conn "greg-popovich-thumbs-up_1704749687791_0")))
           "Imported into Asset page")
+      ;; Annotations
+      (is (= {:logseq.property.pdf/hl-color :logseq.property/color.blue
+              :logseq.property.pdf/hl-page 8
+              :block/tags [:logseq.class/Pdf-annotation]
+              :logseq.property/asset "Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0"}
+             (dissoc (db-test/readable-properties (db-test/find-block-by-content @conn "Duke School"))
+                     :logseq.property.pdf/hl-value :logseq.property/ls-type))
+          "Pdf text highlight has correct properties")
+
       ;; Quotes
       (is (= {:block/tags [:logseq.class/Quote-block]
               :logseq.property.node/display-type :quote}

+ 30 - 0
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.edn

@@ -0,0 +1,30 @@
+{:highlights [{:id #uuid "687022ae-dac1-42a6-9d4c-39a0dba05918",
+               :page 1,
+               :position {:bounding {:x1 131,
+                                     :y1 336,
+                                     :x2 483,
+                                     :y2 399,
+                                     :width 574.0000000000001,
+                                     :height 573.999986224},
+                          :rects (),
+                          :page 1},
+               :content {:text "", :image 1752179374600},
+               :properties {:color "yellow"}}
+              {:id #uuid "68702394-3613-4bac-85a7-28643d58237f",
+               :page 8,
+               :position {:bounding {:x1 10.680589094758034,
+                                     :y1 183.2645263671875,
+                                     :x2 119.76637782156467,
+                                     :y2 204.954345703125,
+                                     :width 574.0000000000001,
+                                     :height 573.999986224},
+                          :rects ({:x1 10.680589094758034,
+                                   :y1 183.2645263671875,
+                                   :x2 119.76637782156467,
+                                   :y2 204.954345703125,
+                                   :width 574.0000000000001,
+                                   :height 573.999986224}),
+                          :page 8},
+               :content {:text "Duke School"},
+               :properties {:color "blue"}}],
+ :extra {:page 2}}

BIN
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf


BIN
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0/1_687022ae-dac1-42a6-9d4c-39a0dba05918_1752179374600.png


+ 1 - 0
deps/graph-parser/test/resources/exporter-test-graph/journals/2025_07_10.md

@@ -0,0 +1 @@
+- ![Sina de Capoeria Batizado 2025 - Program Itinerary.pdf](../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf)

+ 14 - 0
deps/graph-parser/test/resources/exporter-test-graph/pages/hls__Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.md

@@ -0,0 +1,14 @@
+file:: [Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf](../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf)
+file-path:: ../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf
+
+- id:: 687022ae-dac1-42a6-9d4c-39a0dba05918
+  ls-type:: annotation
+  hl-page:: 1
+  hl-color:: yellow
+  hl-type:: area
+  hl-stamp:: 1752179374600
+- Duke School
+  hl-page:: 8
+  ls-type:: annotation
+  id:: 68702394-3613-4bac-85a7-28643d58237f
+  hl-color:: blue

+ 9 - 8
src/main/frontend/components/imports.cljs

@@ -351,26 +351,27 @@
 (defn- read-asset [file assets]
   (-> (.arrayBuffer (:file-object file))
       (p/then (fn [buffer]
-                (p/let [checksum (db-asset/<get-file-array-buffer-checksum buffer)]
+                (p/let [checksum (db-asset/<get-file-array-buffer-checksum buffer)
+                        byte-array (js/Uint8Array. buffer)]
                   (swap! assets assoc
                          (gp-exporter/asset-path->name (:path file))
                          {:size (.-size (:file-object file))
                           :checksum checksum
                           :type (db-asset/asset-path->type (:path file))
                           :path (:path file)
-                          ;; Save buffer to avoid reading asset twice
-                          ::array-buffer buffer}))))))
+                          ;; Save array to avoid reading asset twice
+                          ::byte-array byte-array})
+                  byte-array)))))
 
 (defn- copy-asset [repo repo-dir asset-m]
-  (-> (::array-buffer asset-m)
-      (p/then (fn [buffer]
-                (let [content (js/Uint8Array. buffer)
-                      assets-dir (path/path-join repo-dir common-config/local-assets-dir)]
+  (-> (::byte-array asset-m)
+      (p/then (fn [content]
+                (let [assets-dir (path/path-join repo-dir common-config/local-assets-dir)]
                   (p/do!
                    (fs/mkdir-if-not-exists assets-dir)
                    (if (:block/uuid asset-m)
                      (fs/write-plain-text-file! repo assets-dir (str (:block/uuid asset-m) "." (:type asset-m)) content {:skip-transact? true})
-                     (do
+                     (when-not (:pdf-annotation? asset-m)
                        (println "Copied asset" (pr-str (node-path/basename (:path asset-m)))
                                 "by its name since it was unused.")
                        (fs/write-plain-text-file! repo assets-dir (node-path/basename (:path asset-m)) content {:skip-transact? true})))))))))