Bläddra i källkod

use db/id for hnsw label

db/id is unique and auto-incremented, so old ids are never re-used; this makes it safe to use directly as the HNSW label.
Tienson Qin 5 månader sedan
förälder
incheckning
2c27aee09d

+ 0 - 7
deps/db/src/logseq/db/frontend/property.cljs

@@ -560,13 +560,6 @@
                                                     :cardinality :many
                                                     :public? true}
                                            :queryable? true}
-     :logseq.property.embedding/hnsw-label {:title "HNSW label"
-                                            :schema {:type :raw-number
-                                                     :public? false
-                                                     :hide? true}
-                                            :rtc {:rtc/ignore-attr-when-init-upload true
-                                                  :rtc/ignore-attr-when-init-download true
-                                                  :rtc/ignore-attr-when-syncing true}}
      :logseq.property.embedding/hnsw-label-updated-at {:title "HNSW label updated-at"
                                                        :schema {:type :datetime
                                                                 :public? false

+ 51 - 48
src/main/frontend/components/settings.cljs

@@ -1222,54 +1222,57 @@
          :succ (constantly nil)))
      [])
     [:div.panel-wrap
-     [:div.flex.flex-col.gap-2
-      [:div.it.sm:grid.sm:grid-cols-3.sm:gap-4.sm:items-start
-       [:label.block.text-sm.font-medium.leading-8.opacity-70
-        {:for "local-embedding-model"}
-        "Local embedding model"]
-       [:div.rounded-md.sm:max-w-tss.sm:col-span-2
-        [:div.flex.flex-col.gap-2
-         (shui/select
-          (cond->
-           {:on-value-change (fn [model-name]
-                               (c.m/run-task
-                                 ::load-model
-                                 (m/sp
-                                   (c.m/<?
-                                    (state/<invoke-db-worker :thread-api/vec-search-load-model repo model-name))
-                                   (set-model-info (assoc model-info :graph-text-embedding-model-name model-name))
-                                   (c.m/<?
-                                    (state/<invoke-db-worker :thread-api/vec-search-cancel-indexing repo))
-                                   (c.m/<?
-                                    (state/<invoke-db-worker :thread-api/vec-search-re-embedding-graph-data repo)))
-                                 :succ (constantly nil)))}
-            current-model
-            (assoc :value current-model))
-          (shui/select-trigger
-           {:class "h-8"}
-           (shui/select-value
-            {:placeholder "Select a model"}))
-
-          (shui/select-content
-           (shui/select-group
-            (for [model-name (:available-model-names model-info)]
-              (shui/select-item {:value model-name} model-name)))))
-
-         (when status
-           [:div.text-muted-foreground.text-sm
-            (let [{:keys [file progress loaded total]} load-model-progress]
-              (case status
-                ("progress" "download" "initiate")
-                (str "Downloading " file
-                     (when progress
-                       (util/format " %dm/%dm"
-                                    (int (/ loaded 1024 1024))
-                                    (int (/ total 1024 1024)))))
-                "done"
-                (str "Downloaded " file)
-                "ready"
-                "Model is ready  🚀"
-                nil))])]]]]]))
+     [:div.flex.flex-col.gap-2.mt-4
+      [:div.font-medium.text-muted-foreground.text-sm "Semantic search:"]
+
+      [:div.flex.flex-col.gap-2
+       [:div.it.sm:grid.sm:grid-cols-3.sm:gap-4.sm:items-start
+        [:label.block.text-sm.font-medium.leading-8.opacity-70
+         {:for "local-embedding-model"}
+         "Local embedding model"]
+        [:div.rounded-md.sm:max-w-tss.sm:col-span-2
+         [:div.flex.flex-col.gap-2
+          (shui/select
+           (cond->
+            {:on-value-change (fn [model-name]
+                                (c.m/run-task
+                                  ::load-model
+                                  (m/sp
+                                    (c.m/<?
+                                     (state/<invoke-db-worker :thread-api/vec-search-load-model repo model-name))
+                                    (set-model-info (assoc model-info :graph-text-embedding-model-name model-name))
+                                    (c.m/<?
+                                     (state/<invoke-db-worker :thread-api/vec-search-cancel-indexing repo))
+                                    (c.m/<?
+                                     (state/<invoke-db-worker :thread-api/vec-search-re-embedding-graph-data repo)))
+                                  :succ (constantly nil)))}
+             current-model
+             (assoc :value current-model))
+           (shui/select-trigger
+            {:class "h-8"}
+            (shui/select-value
+             {:placeholder "Select a model"}))
+
+           (shui/select-content
+            (shui/select-group
+             (for [model-name (:available-model-names model-info)]
+               (shui/select-item {:value model-name} model-name)))))
+
+          (when status
+            [:div.text-muted-foreground.text-sm
+             (let [{:keys [file progress loaded total]} load-model-progress]
+               (case status
+                 ("progress" "download" "initiate")
+                 (str "Downloading " file
+                      (when progress
+                        (util/format " %d/%dm"
+                                     (int (/ loaded 1024 1024))
+                                     (int (/ total 1024 1024)))))
+                 "done"
+                 (str "Downloaded " file)
+                 "ready"
+                 "Model is ready  🚀"
+                 nil))])]]]]]]))
 
 (rum/defcs ^:large-vars/cleanup-todo settings
   < (rum/local DEFAULT-ACTIVE-TAB-STATE ::active)

+ 2 - 2
src/main/frontend/inference_worker/inference_worker.cljs

@@ -36,9 +36,9 @@
 
   (text-embedding+store!
    ;; return labels(js array)
-   [_this repo text-array delete-labels replace-deleted?]
+   [_this repo text-array labels replace-deleted?]
    (p/chain
-    (js/Promise. (infer-worker.text-embedding/task--text-embedding&store! repo text-array delete-labels replace-deleted?))
+    (js/Promise. (infer-worker.text-embedding/task--text-embedding&store! repo text-array labels replace-deleted?))
     clj->js))
 
   (delete-labels

+ 12 - 5
src/main/frontend/inference_worker/text_embedding.cljs

@@ -74,7 +74,7 @@
      :size (.-size r)}))
 
 (defn- add-items
-  [^js hnsw data-coll replace-deleted?]
+  [^js hnsw data-coll labels replace-deleted?]
   (let [max-elems (.getMaxElements hnsw)
         current-count (.getCurrentCount hnsw)
         add-count (count data-coll)]
@@ -82,7 +82,15 @@
       (let [new-size (+ current-count (max (* 2 add-count) current-count))]
         (log/info :hnsw-resize {:from current-count :to new-size})
         (.resizeIndex hnsw new-size)))
-    (.addItems hnsw data-coll replace-deleted?)))
+    ;; (.addItems hnsw data-coll labels replace-deleted?)
+    (dorun
+     (mapcat
+      (fn [embedding label]
+        (assert (and embedding label) {:embedding embedding
+                                       :label label})
+        (.addPoint hnsw embedding label replace-deleted?))
+      data-coll
+      labels))))
 
 (defn delete-items
   [repo labels]
@@ -91,7 +99,7 @@
 
 (defn task--text-embedding&store!
   "return labels(js-array)"
-  [repo text-array delete-labels replace-deleted?]
+  [repo text-array labels replace-deleted?]
   (m/sp
     (when (model-loaded?)
       (let [hnsw (or (get-hnsw-index repo) (new-hnsw-index! repo))
@@ -99,9 +107,8 @@
                                                                  (c.m/<? (<text-embedding text-array)))
             data-coll (split-into-chunks data (last dims))
             _ (assert (= (count text-array) (count data-coll)))]
-        (when (seq delete-labels) (.markDeleteItems hnsw (into-array delete-labels)))
         (worker-util/profile (keyword "add-items" (str (alength data-coll)))
-                             (add-items hnsw data-coll replace-deleted?))))))
+                             (add-items hnsw data-coll labels replace-deleted?))))))
 
 (def ^:private write-index-wait-delays-flow
   (m/ap

+ 1 - 1
src/main/frontend/worker/db/migrate.cljs

@@ -355,7 +355,7 @@
    ["65.5" {:fix remove-block-order-for-tags}]
    ["65.6" {:fix update-extends-to-cardinality-many}]
    ["65.7" {:fix add-quick-add-page}]
-   ["65.8" {:properties [:logseq.property.embedding/hnsw-label :logseq.property.embedding/hnsw-label-updated-at]}]])
+   ["65.8" {:properties [:logseq.property.embedding/hnsw-label-updated-at]}]])
 
 (let [[major minor] (last (sort (map (comp (juxt :major :minor) db-schema/parse-schema-version first)
                                      schema-version->updates)))]

+ 32 - 50
src/main/frontend/worker/embedding.cljs

@@ -99,18 +99,16 @@
                (vswap! *partition-index inc))))))))
 
 (defn- labels-update-tx-data
-  [db e+updated-at-coll added-labels]
-  (assert (= (count e+updated-at-coll) (count added-labels)) [e+updated-at-coll added-labels])
+  [db e+updated-at-coll]
   (let [es (map first e+updated-at-coll)
         exist-es (set (keep
                        (fn [b] (when (:block/uuid b) (:db/id b)))
                        (d/pull-many db [:block/uuid :db/id] es)))]
-    (mapcat
-     (fn [[e updated-at] label]
+    (keep
+     (fn [[e updated-at]]
        (when (contains? exist-es e)
-         [[:db/add e :logseq.property.embedding/hnsw-label label]
-          [:db/add e :logseq.property.embedding/hnsw-label-updated-at updated-at]]))
-     e+updated-at-coll added-labels)))
+         [:db/add e :logseq.property.embedding/hnsw-label-updated-at updated-at]))
+     e+updated-at-coll)))
 
 (defn- task--update-index-info!*
   ([repo ^js infer-worker]
@@ -156,12 +154,14 @@
             (m/? (task--update-index-info!* repo infer-worker true))
             (doseq [stale-block-chunk (sequence (partition-by-text-size (get-partition-size repo)) stale-blocks)]
               (let [e+updated-at-coll (map (juxt :db/id :block/updated-at) stale-block-chunk)
-                    delete-labels (into-array (keep :logseq.property.embedding/hnsw-label stale-block-chunk))
-                    added-labels (c.m/<?
-                                  (.text-embedding+store!
-                                   infer-worker repo (into-array (map :block.temp/text-to-embedding stale-block-chunk))
-                                   delete-labels false))
-                    tx-data (labels-update-tx-data @conn e+updated-at-coll added-labels)]
+                    _ (c.m/<?
+                       (.text-embedding+store!
+                        infer-worker
+                        repo
+                        (into-array (map :block.temp/text-to-embedding stale-block-chunk))
+                        (into-array (map :db/id stale-block-chunk))
+                        false))
+                    tx-data (labels-update-tx-data @conn e+updated-at-coll)]
                 (d/transact! conn tx-data {:skip-refresh? true})
                 (m/? (task--update-index-info!* repo infer-worker true))))
             (c.m/<? (.write-index! infer-worker repo))
@@ -178,11 +178,13 @@
         (let [all-blocks (stale-block-lazy-seq @conn true)]
           (doseq [block-chunk (sequence (partition-by-text-size (get-partition-size repo)) all-blocks)]
             (let [e+updated-at-coll (map (juxt :db/id :block/updated-at) block-chunk)
-                  added-labels (c.m/<?
-                                (.text-embedding+store!
-                                 infer-worker repo (into-array (map :block.temp/text-to-embedding block-chunk))
-                                 nil false))
-                  tx-data (labels-update-tx-data @conn e+updated-at-coll added-labels)]
+                  _ (c.m/<?
+                     (.text-embedding+store!
+                      infer-worker repo
+                      (into-array (map :block.temp/text-to-embedding block-chunk))
+                      (into-array (map :db/id block-chunk))
+                      false))
+                  tx-data (labels-update-tx-data @conn e+updated-at-coll)]
               (d/transact! conn tx-data {:skip-refresh? true})
               (m/? (task--update-index-info!* repo infer-worker true)))))
         (c.m/<? (.write-index! infer-worker repo))
@@ -242,18 +244,6 @@
           (d/transact! conn [(ldb/kv :logseq.kv/graph-text-embedding-model-name model-name)])
           (log/info :loaded-model model-name))))))
 
-(defn- remove-outdated-hnsw-label!
-  [conn es]
-  (when (seq es)
-    (d/transact!
-     conn
-     (mapcat
-      (fn [e]
-        [[:db.fn/retractAttribute e :logseq.property.embedding/hnsw-label]
-         [:db.fn/retractAttribute e :logseq.property.embedding/hnsw-label-updated-at]])
-      es)
-     {:skip-refresh? true})))
-
 (defn task--search
   [repo query-string nums-neighbors]
   (m/sp
@@ -265,27 +255,19 @@
                                      (js->clj (c.m/<? (.search infer-worker repo query-string nums-neighbors)) :keywordize-keys true))
                 labels (->> (map vector distances neighbors)
                             (keep (fn [[distance label]]
-                                    (when-not (or (js/isNaN distance) (> distance 0.3))
+                                    (when-not (or (js/isNaN distance) (> distance 0.65))
                                       label))))
-                datoms (map (fn [label]
-                              (->> label
-                                   (d/datoms @conn :avet :logseq.property.embedding/hnsw-label)
-                                   (sort-by :tx >))) labels)
-                result-es (keep (comp :e first) datoms)
-                es-with-outdated-hnsw-label (map :e (mapcat next datoms))
-                blocks (map #(d/entity @conn %) result-es)]
-            (remove-outdated-hnsw-label! conn es-with-outdated-hnsw-label)
-            (pp/print-table ["id" "hnsw-label" "title"] (map #(-> %
-                                                                  (update-keys name)
-                                                                  (update-vals (fn [v]
-                                                                                 (if (and (string? v) (> (count v) 60))
-                                                                                   (str (subs v 0 60) "[TRUNCATED]")
-                                                                                   v))))
-                                                             (map #(select-keys %
-                                                                                [:db/id
-                                                                                 :block/title
-                                                                                 :logseq.property.embedding/hnsw-label])
-                                                                  blocks)))
+                blocks (map #(d/entity @conn %) labels)]
+            (pp/print-table ["id" "title"] (map #(-> %
+                                                     (update-keys name)
+                                                     (update-vals (fn [v]
+                                                                    (if (and (string? v) (> (count v) 60))
+                                                                      (str (subs v 0 60) "[TRUNCATED]")
+                                                                      v))))
+                                                (map #(select-keys %
+                                                                   [:db/id
+                                                                    :block/title])
+                                                     blocks)))
             blocks))))))
 
 (def ^:private vector-search-state-flow