# vector.py
import os
import faiss
from llama_index.core import StorageContext, VectorStoreIndex, load_index_from_storage
from llama_index.legacy.vector_stores import FaissVectorStore
from models.embedding import GLMEmbeddings
from tqdm import tqdm
from utils.data import split_into_chunks

# Module-level embedding model, instantiated once at import time and shared by
# save_vectors() below; its embedding_size fixes the FAISS index dimensionality.
embed_model = GLMEmbeddings()
  9. def save_vectors(files: list[str], args):
  10. # split file into chunks
  11. nodes = []
  12. for file in tqdm(files, desc="文件切分"):
  13. nodes.extend(split_into_chunks(file, args.lines_per_chunk, args.lines_overlap, args.max_chars))
  14. # initialize vector store
  15. vector_store = FaissVectorStore(faiss_index=faiss.IndexFlatL2(embed_model.embedding_size))
  16. storage_context = StorageContext.from_defaults(vector_store=vector_store)
  17. # translate to vectors
  18. index = VectorStoreIndex(nodes=nodes, storage_context=storage_context, embed_model=embed_model)
  19. # save embedded vectors
  20. output_path = args.output_path
  21. os.makedirs(output_path, exist_ok=True)
  22. index.storage_context.persist(persist_dir=output_path)
  23. print(f"文件向量化完成,已保存至{output_path}")
  24. def load_vectors(vector_path: str):
  25. vector_store = FaissVectorStore.from_persist_dir(vector_path)
  26. storage_context = StorageContext.from_defaults(vector_store=vector_store, persist_dir=vector_path)
  27. return load_index_from_storage(storage_context=storage_context)