Bläddra i källkod

feat: text-encode multiple embeddings

Junyi Du 2 år sedan
förälder
incheckning
6253b54ec7
5 ändrade filer med 61 tillägg och 31 borttagningar
  1. 1 1
      libs/src/LSPlugin.ts
  2. 1 1
      package.json
  3. 15 0
      src/main/frontend/ai/vector_store.cljs
  4. 40 25
      src/main/frontend/search/semantic.cljs
  5. 4 4
      yarn.lock

+ 1 - 1
libs/src/LSPlugin.ts

@@ -331,7 +331,7 @@ export interface IPluginTextEncoderServiceHooks {
   name: string
   options?: Record<string, any>
 
-  textEncode: (text: string) => Promise<Float32Array>
+  textEncode: (text: string) => Promise<Float32Array[]>
 }
 
 /**

+ 1 - 1
package.json

@@ -95,7 +95,7 @@
         "@isomorphic-git/lightning-fs": "^4.6.0",
         "@logseq/capacitor-file-sync": "0.0.35",
         "@logseq/diff-merge": "0.2.2",
-        "@logseq/logmind": "^0.1.2",
+        "@logseq/logmind": "^0.1.5",
         "@logseq/react-tweet-embed": "1.3.1-1",
         "@radix-ui/colors": "^0.1.8",
         "@sentry/react": "^6.18.2",

+ 15 - 0
src/main/frontend/ai/vector_store.cljs

@@ -42,12 +42,27 @@
   ([store embed key data]
    (.add store embed key data)))
 
+;; (defn addmany
+;;   "Add multiple records to the vector store
+;;    - store: store handler (conn)
+;;    - embeds: the vectors to be added
+;;    - key: identifier for the record
+;;    - data: attached metadata for the record (notice: IPC required, so don't send big objects)
+;;
+;;    Returns a promise of the vector store addition
+;;    or throw an error if the store doesn't exist"
+;;   ([store embeds key]
+;;    (.addmany store embeds key))
+;;   ([store embeds key data]
+;;    (.addmany store embeds key data)))
+
 (defn rm
   "Remove a record from the vector store
    - store: store handler (conn)
    - key: identifier for the record
    
    Returns a promise of the vector store removal
+   that resolves to true on success and false on failure,
    or throw an error if the store doesn't exist"
     [store key]
     (.remove store key))

+ 40 - 25
src/main/frontend/search/semantic.cljs

@@ -1,10 +1,11 @@
 (ns frontend.search.semantic
   "Browser implementation of search protocol"
   (:require ["@logseq/logmind" :refer [taskQueue]]
+            [cljs-bean.core :as bean]
+            [promesa.core :as p]
             [frontend.search.protocol :as protocol]
             [frontend.ai.vector-store :as vector-store]
             [frontend.ai.text-encoder :as text-encoder]
-            [promesa.core :as p]
             [frontend.state :as state]
             [logseq.graph-parser.util :as gp-util]))
 
@@ -27,12 +28,10 @@
     nil)
 
   (transact-blocks! [_this {:keys [blocks-to-remove-set
-                                   blocks-to-add]
-                            :as data}]
-    ;; Step 1: encoding all sentences
-    ;; Step 2: inference vec length
-    ;; Step 3: create vector store (optional)
-    ;; Setp 4: add to vec store
+                                   blocks-to-add]}]
+    ;; Step 1: create vector store handler
+    ;; Step 2: deal with blocks-to-remove-set
+    ;; Step 3: deal with blocks-to-add
     ;; {:blocks-to-remove-set #{16634}, :blocks-to-add ({:id 16634, :uuid "647dcfc7-2aba-4015-8b71-cdf73c552761", :page 12, :content "adding me 2"})}
     ;; Handling blocks to add
     (let [encoder      (state/get-semsearch-encoder)
@@ -41,28 +40,44 @@
           store-conn   (if encoder-dim
                          (vector-store/create (idstr-template-string repo) encoder-dim)
                          (throw (js/Error. (str "record modelDim is not found in options of registrated encoder " encoder-name))))
-          addtask-fn (fn [block] (.addTask taskQueue (:uuid block)
-                                       (fn [] ;; Promise factory
-                                         ;; TODO Junyi: Block Chunker
-                                         (p/let [data  {:snippet (gp-util/safe-subs (:content block) 0 20)
-                                                        :page    (:page block)
-                                                        :id      (:id block)}
-                                                 embed (text-encoder/text-encode (:content block) encoder-name)]
-                                           (vector-store/add store-conn embed (:uuid block) data)))))]
-      (mapv addtask-fn blocks-to-add)))
+          eid-del->vs (fn [eid]
+                       ;; Replaces the existing task with the same id in the queue (if any).
+                       ;; If that task's promise is already pending, there is a race condition:
+                       ;; this task's promise may be executed before the pending promise
+                       ;; is resolved.
+                       (let [del->vs (fn [] ;; Promise factory
+                                       (vector-store/rm store-conn (str eid)))]
+                         (.addTask taskQueue (str eid) del->vs)))
+          block-add->vs (fn [block] 
+                       ;; Replaces any task already in the queue that has the same id.
+                       ;; The stringified id is used as the key, for consistency with the logMind type annotations.
+                       (let [add->vs (fn []
+                                    (p/let [metadata  {:snippet (gp-util/safe-subs (:content block) 0 20)
+                                                       :page    (:page block)
+                                                       :id      (:id block)
+                                                       :uuid    (:uuid block)}
+                                            embeds    (text-encoder/text-encode (:content block) encoder-name)
+                                            _         (vector-store/rm store-conn (str (:id block)))
+                                            emb-add->vs   (fn [embed]
+                                                            (vector-store/add store-conn embed (str (:id block)) (bean/->js metadata)))]
+                                      (p/all (mapv emb-add->vs embeds))))]
+                         (.addTask taskQueue (str (:id block)) add->vs)))]
+      ;; Delete first, then add
+      (mapv eid-del->vs blocks-to-remove-set)
+      (mapv block-add->vs blocks-to-add)))
   
-  (transact-pages! [_this data]
-                   
-                   (vector-store/create "test" 128)
+  (transact-pages! [_this data] 
     (prn "semantic: transact-pages!") ;; TODO Junyi
     (prn data))
 
   (truncate-blocks! [_this]
-    (-> repo
-        (idstr-template-string)
-        (vector-store/reset)))
+                    (-> repo
+                        (idstr-template-string)
+                        (vector-store/reset))
+                    (.clean taskQueue))
 
   (remove-db! [_this]
-    (-> repo
-        (idstr-template-string)
-        (vector-store/reset))))
+              (-> repo
+                  (idstr-template-string)
+                  (vector-store/reset))
+              (.clean taskQueue)))

+ 4 - 4
yarn.lock

@@ -519,10 +519,10 @@
   resolved "https://registry.yarnpkg.com/@logseq/diff-merge/-/diff-merge-0.2.2.tgz#583bd8c8c66d5ff05ea70906475efaa078e839a3"
   integrity sha512-0WeKNhq8PsjvunOqNEd9aSM4tgiClwhonXgXzrQ4KYj8VoyLaEAyEWWGOAoE7mwR+aqwM+bMB4MxuNFywnUb8A==
 
-"@logseq/logmind@^0.1.2":
-  version "0.1.2"
-  resolved "https://registry.yarnpkg.com/@logseq/logmind/-/logmind-0.1.2.tgz#026eed5cc225f5df1b7d2cc63f665d46f7209c3a"
-  integrity sha512-JIoWslOW2T94YRVCk8HwwBGRZUD1kQks1v+00MHLwRBni/9nw/BjeSuEmOPhYb2WLBELRmwprqtddyQM2Kvqkw==
+"@logseq/logmind@^0.1.5":
+  version "0.1.5"
+  resolved "https://registry.yarnpkg.com/@logseq/logmind/-/logmind-0.1.5.tgz#e2c4b84df938942972553be8f35242da4d15c40d"
+  integrity sha512-ZcQmnVwpIisvtdyqO6GaEAfwbqOJbSX/FroyTBsPZcvY/T7It5VyCpNKXVSvdaC8NdhCi+xEkvX5woniUin1KA==
   dependencies:
     "@xenova/transformers" "^2.3.0"
     compromise "^14.8.0"