Ver Fonte

enhance: import basic pdf text annotation

Only reads the annotation edn file for now. Works in both the UI and the CLI.
Also correctly keeps the annotation edn file from being copied, since
it's no longer used.
Gabriel Horner há 5 meses atrás
pai
commit
536f39cbbd

+ 3 - 2
deps/graph-parser/script/db_import.cljs

@@ -57,14 +57,15 @@
            {:size (.-length buffer)
             :checksum checksum
             :type (db-asset/asset-path->type (:path file))
-            :path (:path file)})))
+            :path (:path file)})
+    buffer))
 
 (defn- <copy-asset-file [asset-m db-graph-dir]
   (p/let [parent-dir (node-path/join db-graph-dir common-config/local-assets-dir)
           _ (fsp/mkdir parent-dir #js {:recursive true})]
     (if (:block/uuid asset-m)
       (fsp/copyFile (:path asset-m) (node-path/join parent-dir (str (:block/uuid asset-m) "." (:type asset-m))))
-      (do
+      (when-not (:pdf-annotation? asset-m)
         (println "[INFO]" "Copied asset" (pr-str (node-path/basename (:path asset-m)))
                  "by its name since it was unused.")
         (fsp/copyFile (:path asset-m) (node-path/join parent-dir (node-path/basename (:path asset-m))))))))

+ 52 - 8
deps/graph-parser/src/logseq/graph_parser/exporter.cljs

@@ -35,7 +35,8 @@
             [logseq.graph-parser.block :as gp-block]
             [logseq.graph-parser.extract :as extract]
             [logseq.graph-parser.property :as gp-property]
-            [promesa.core :as p]))
+            [promesa.core :as p]
+            [logseq.graph-parser.utf8 :as utf8]))
 
 (defn- add-missing-timestamps
   "Add updated-at or created-at timestamps if they doesn't exist"
@@ -1005,9 +1006,41 @@
           block-title
           asset-name-to-uuids))
 
+(defn- build-annotation-tx
+  "Creates annotations for a pdf asset given the asset's edn file"
+  [asset-edn-map new-asset {:keys [log-fn] :or {log-fn prn}}]
+  (let [color-text-idents
+        (->> (get-in db-property/built-in-properties [:logseq.property.pdf/hl-color :closed-values])
+             (map (juxt :value :db-ident))
+             (into {}))]
+    (mapv #(let [user-attributes
+                 {:logseq.property.pdf/hl-color (get color-text-idents (get-in % [:properties :color]))
+                  :logseq.property.pdf/hl-page (:page %)
+                  :block/title (get-in % [:content :text])}
+                 _ (when (some (comp nil? val) user-attributes)
+                     (log-fn :missing-annotation-attributes "Annotation is missing some attributes so set reasonable defaults for them"
+                             {:annotation user-attributes :asset (:block/title new-asset)}))
+                 annotation (merge
+                             ;; Reasonable defaults for user attributes
+                             {:logseq.property.pdf/hl-color :logseq.property/color.yellow
+                              :logseq.property.pdf/hl-page 1
+                              :block/title ""}
+                             user-attributes
+                             {:block/uuid (d/squuid)
+                              :block/order (db-order/gen-key)
+                              :logseq.property/ls-type :annotation
+                              :logseq.property.pdf/hl-value (dissoc % :id)
+                              :logseq.property/asset [:block/uuid (:block/uuid new-asset)]
+                              :block/tags [:logseq.class/Pdf-annotation]
+                              :block/parent [:block/uuid (:block/uuid new-asset)]
+                              :block/page :logseq.class/Asset})]
+             (prn :annotation-added! user-attributes)
+             (add-missing-timestamps annotation))
+          (get-in asset-edn-map [:edn-content :highlights]))))
+
 (defn- handle-assets-in-block
   "If a block contains assets, creates them as #Asset nodes in the Asset page and references them in the block."
-  [block {:keys [asset-links]} {:keys [assets ignored-assets]}]
+  [block {:keys [asset-links]} {:keys [assets ignored-assets]} opts]
   (if (seq asset-links)
     (let [asset-maps
           (keep
@@ -1028,12 +1061,19 @@
                                            :logseq.property.asset/size (:size asset-data)
                                            :block/title (db-asset/asset-name->title (node-path/basename asset-name))}
                                           (when-let [metadata (not-empty (common-util/safe-read-map-string (:metadata (second asset-link))))]
-                                            {:logseq.property.asset/resize-metadata metadata}))]
-                      ;;  (prn :asset-added! (node-path/basename asset-name) #_(get @assets asset-name))
+                                            {:logseq.property.asset/resize-metadata metadata}))
+                         asset-edn-path (when (= "pdf" (path/file-ext asset-name)) (string/replace-first asset-name #"(?i)\.pdf$" ".edn"))
+                         ;; Mark edn asset so it isn't treated like a normal asset later
+                         _ (when (get @assets asset-edn-path)
+                             (swap! assets assoc-in [asset-edn-path :pdf-annotation?] true))
+                         asset-tx (concat [new-asset]
+                                          (when-let [asset-edn-map (get @assets asset-edn-path)]
+                                            (build-annotation-tx asset-edn-map new-asset opts)))]
+                     (prn :asset-added! (node-path/basename asset-name))
                       ;;  (cljs.pprint/pprint asset-link)
                      (swap! assets assoc-in [asset-name :block/uuid] (:block/uuid new-block))
                      {:asset-name-uuid [asset-name (:block/uuid new-asset)]
-                      :asset new-asset}))
+                      :asset-tx asset-tx}))
                  (do
                    (swap! ignored-assets conj
                           {:reason "No asset data found for this asset path"
@@ -1041,7 +1081,7 @@
                            :location {:block (:block/title block)}})
                    nil))))
            asset-links)
-          asset-blocks (keep :asset asset-maps)
+          asset-blocks (mapcat :asset-tx asset-maps)
           asset-names-to-uuids
           (into {} (map :asset-name-uuid asset-maps))]
       (cond-> {:block
@@ -1094,7 +1134,7 @@
         {block-after-built-in-props :block deadline-properties-tx :properties-tx}
         (update-block-deadline-and-scheduled block page-names-to-uuids options)
         {block-after-assets :block :keys [asset-blocks-tx]}
-        (handle-assets-in-block block-after-built-in-props walked-ast-blocks (select-keys import-state [:assets :ignored-assets]))
+        (handle-assets-in-block block-after-built-in-props walked-ast-blocks (select-keys import-state [:assets :ignored-assets]) (select-keys options [:log-fn]))
         ;; :block/page should be [:block/page NAME]
         journal-page-created-at (some-> (:block/page block*) second journal-created-ats)
         prepared-block (cond-> block-after-assets
@@ -1793,7 +1833,11 @@
                           (sort-by :path *asset-files)
                           (range 0 (count *asset-files)))
         read-asset (fn read-asset [{:keys [path] :as file}]
-                     (-> (<read-asset-file file assets)
+                     (-> (p/let [byte-array (<read-asset-file file assets)]
+                           (when (= "edn" (path/file-ext (:path file)))
+                             (swap! assets assoc-in
+                                    [(asset-path->name path) :edn-content]
+                                    (common-util/safe-read-map-string (utf8/decode byte-array)))))
                          (p/catch
                           (fn [error]
                             (notify-user {:msg (str "Import failed to read " (pr-str path) " with error:\n" (.-message error))

+ 22 - 9
deps/graph-parser/test/logseq/graph_parser/exporter_test.cljs

@@ -103,7 +103,8 @@
            {:size (.-length buffer)
             :checksum checksum
             :type (db-asset/asset-path->type (:path file))
-            :path (:path file)})))
+            :path (:path file)})
+    buffer))
 
 ;; Copied from db-import script and tweaked for an in-memory import
 (defn- import-file-graph-to-db
@@ -119,10 +120,11 @@
                         ;; asset file options
                          :<read-asset <read-asset-file
                          :<copy-asset (fn copy-asset [m]
-                                        (when-not (:block/uuid m)
-                                          (println "[INFO]" "Asset" (pr-str (node-path/basename (:path m)))
-                                                   "does not have a :block/uuid"))
-                                        (swap! assets conj m))}
+                                        (if (:block/uuid m)
+                                          (swap! assets conj m)
+                                          (when-not (:pdf-annotation? m)
+                                            (println "[INFO]" "Asset" (pr-str (node-path/basename (:path m)))
+                                                     "does not have a :block/uuid"))))}
                         (select-keys options [:verbose]))]
     (gp-exporter/export-file-graph conn conn config-file *files options')))
 
@@ -190,7 +192,7 @@
                 (remove #(= [{:db/ident :logseq.class/Tag}] (:block/tags %)))))
         "All classes only have :logseq.class/Tag as their tag (and don't have Page)")))
 
-(deftest-async export-basic-graph-with-convert-all-tags
+(deftest-async ^:focus export-basic-graph-with-convert-all-tags
   ;; This graph will contain basic examples of different features to import
   (p/let [file-graph-dir "test/resources/exporter-test-graph"
           conn (db-test/create-conn)
@@ -206,13 +208,14 @@
 
       ;; Counts
       ;; Includes journals as property values e.g. :logseq.property/deadline
-      (is (= 27 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Journal]] @conn))))
+      (is (= 28 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Journal]] @conn))))
 
-      (is (= 3 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Asset]] @conn))))
+      (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Asset]] @conn))))
       (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Task]] @conn))))
       (is (= 4 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Query]] @conn))))
       (is (= 2 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Card]] @conn))))
       (is (= 3 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Quote-block]] @conn))))
+      (is (= 2 (count (d/q '[:find ?b :where [?b :block/tags :logseq.class/Pdf-annotation]] @conn))))
 
       ;; Properties and tags aren't included in this count as they aren't a Page
       (is (= 10
@@ -235,7 +238,8 @@
       (is (= 0 (count @(:ignored-properties import-state))) "No ignored properties")
       (is (= 0 (count @(:ignored-assets import-state))) "No ignored assets")
       (is (= 1 (count @(:ignored-files import-state))) "Ignore .edn for now")
-      (is (= 3 (count @assets))))
+      ;; (cljs.pprint/pprint @(:ignored-files import-state))
+      (is (= 5 (count @assets))))
 
     (testing "logseq files"
       (is (= ".foo {}\n"
@@ -418,6 +422,15 @@
       (is (= (d/entity @conn :logseq.class/Asset)
              (:block/page (db-test/find-block-by-content @conn "greg-popovich-thumbs-up_1704749687791_0")))
           "Imported into Asset page")
+      ;; Annotations
+      (is (= {:logseq.property.pdf/hl-color :logseq.property/color.blue
+              :logseq.property.pdf/hl-page 8
+              :block/tags [:logseq.class/Pdf-annotation]
+              :logseq.property/asset "Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0"}
+             (dissoc (db-test/readable-properties (db-test/find-block-by-content @conn "Duke School"))
+                     :logseq.property.pdf/hl-value :logseq.property/ls-type))
+          "Pdf text highlight has correct properties")
+
       ;; Quotes
       (is (= {:block/tags [:logseq.class/Quote-block]
               :logseq.property.node/display-type :quote}

+ 30 - 0
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.edn

@@ -0,0 +1,30 @@
+{:highlights [{:id #uuid "687022ae-dac1-42a6-9d4c-39a0dba05918",
+               :page 1,
+               :position {:bounding {:x1 131,
+                                     :y1 336,
+                                     :x2 483,
+                                     :y2 399,
+                                     :width 574.0000000000001,
+                                     :height 573.999986224},
+                          :rects (),
+                          :page 1},
+               :content {:text "", :image 1752179374600},
+               :properties {:color "yellow"}}
+              {:id #uuid "68702394-3613-4bac-85a7-28643d58237f",
+               :page 8,
+               :position {:bounding {:x1 10.680589094758034,
+                                     :y1 183.2645263671875,
+                                     :x2 119.76637782156467,
+                                     :y2 204.954345703125,
+                                     :width 574.0000000000001,
+                                     :height 573.999986224},
+                          :rects ({:x1 10.680589094758034,
+                                   :y1 183.2645263671875,
+                                   :x2 119.76637782156467,
+                                   :y2 204.954345703125,
+                                   :width 574.0000000000001,
+                                   :height 573.999986224}),
+                          :page 8},
+               :content {:text "Duke School"},
+               :properties {:color "blue"}}],
+ :extra {:page 2}}

BIN
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf


BIN
deps/graph-parser/test/resources/exporter-test-graph/assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0/1_687022ae-dac1-42a6-9d4c-39a0dba05918_1752179374600.png


+ 1 - 0
deps/graph-parser/test/resources/exporter-test-graph/journals/2025_07_10.md

@@ -0,0 +1 @@
+- ![Sina de Capoeria Batizado 2025 - Program Itinerary.pdf](../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf)

+ 14 - 0
deps/graph-parser/test/resources/exporter-test-graph/pages/hls__Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.md

@@ -0,0 +1,14 @@
+file:: [Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf](../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf)
+file-path:: ../assets/Sina_de_Capoeria_Batizado_2025_-_Program_Itinerary_1752179325104_0.pdf
+
+- id:: 687022ae-dac1-42a6-9d4c-39a0dba05918
+  ls-type:: annotation
+  hl-page:: 1
+  hl-color:: yellow
+  hl-type:: area
+  hl-stamp:: 1752179374600
+- Duke School
+  hl-page:: 8
+  ls-type:: annotation
+  id:: 68702394-3613-4bac-85a7-28643d58237f
+  hl-color:: blue

+ 9 - 8
src/main/frontend/components/imports.cljs

@@ -351,26 +351,27 @@
 (defn- read-asset [file assets]
   (-> (.arrayBuffer (:file-object file))
       (p/then (fn [buffer]
-                (p/let [checksum (db-asset/<get-file-array-buffer-checksum buffer)]
+                (p/let [checksum (db-asset/<get-file-array-buffer-checksum buffer)
+                        byte-array (js/Uint8Array. buffer)]
                   (swap! assets assoc
                          (gp-exporter/asset-path->name (:path file))
                          {:size (.-size (:file-object file))
                           :checksum checksum
                           :type (db-asset/asset-path->type (:path file))
                           :path (:path file)
-                          ;; Save buffer to avoid reading asset twice
-                          ::array-buffer buffer}))))))
+                          ;; Save array to avoid reading asset twice
+                          ::byte-array byte-array})
+                  byte-array)))))
 
 (defn- copy-asset [repo repo-dir asset-m]
-  (-> (::array-buffer asset-m)
-      (p/then (fn [buffer]
-                (let [content (js/Uint8Array. buffer)
-                      assets-dir (path/path-join repo-dir common-config/local-assets-dir)]
+  (-> (::byte-array asset-m)
+      (p/then (fn [content]
+                (let [assets-dir (path/path-join repo-dir common-config/local-assets-dir)]
                   (p/do!
                    (fs/mkdir-if-not-exists assets-dir)
                    (if (:block/uuid asset-m)
                      (fs/write-plain-text-file! repo assets-dir (str (:block/uuid asset-m) "." (:type asset-m)) content {:skip-transact? true})
-                     (do
+                     (when-not (:pdf-annotation? asset-m)
                        (println "Copied asset" (pr-str (node-path/basename (:path asset-m)))
                                 "by its name since it was unused.")
                        (fs/write-plain-text-file! repo assets-dir (node-path/basename (:path asset-m)) content {:skip-transact? true})))))))))