| 12345678910111213141516171819202122232425262728293031323334353637383940414243444546474849505152535455565758596061626364656667686970717273747576777879808182 |
- """Unit tests for embedding service."""
- import pytest
- from src.embeddings import EmbeddingService
- from src.utils import generate_text_hash
@pytest.fixture
def embedding_service() -> EmbeddingService:
    """Provide an EmbeddingService constructed with no arguments."""
    service = EmbeddingService()
    return service
def test_encode_single_text(embedding_service: EmbeddingService) -> None:
    """Encoding one string yields a list of floats of length embedding_dim."""
    sentence = "This is a test sentence for embedding"
    digest = generate_text_hash(sentence)

    vector = embedding_service.encode(sentence, digest)

    assert isinstance(vector, list)
    assert len(vector) == embedding_service.embedding_dim
    # Every component must be a plain Python float.
    for component in vector:
        assert isinstance(component, float)
def test_encode_batch(embedding_service: EmbeddingService) -> None:
    """Batch encoding returns one embedding_dim-sized vector per input text."""
    texts = ["First text", "Second text", "Third text"]
    # Captured before the call so the expectation is the original input count.
    expected_count = len(texts)

    vectors = embedding_service.encode_batch(texts)

    assert len(vectors) == expected_count
    for vector in vectors:
        assert len(vector) == embedding_service.embedding_dim
def test_cosine_similarity(embedding_service: EmbeddingService) -> None:
    """Cosine similarity ranks paraphrased, unrelated and identical texts."""
    sentences = (
        "The cat sat on the mat",
        "A cat was sitting on a mat",
        "Python programming language",
    )

    # Hash every sentence first, then encode them in the same order.
    digests = [generate_text_hash(sentence) for sentence in sentences]
    cat_vec, paraphrase_vec, unrelated_vec = [
        embedding_service.encode(sentence, digest)
        for sentence, digest in zip(sentences, digests)
    ]

    # Two phrasings of the same scene must score above 0.7.
    paraphrase_score = embedding_service.cosine_similarity(cat_vec, paraphrase_vec)
    assert paraphrase_score > 0.7

    # An unrelated topic must score below 0.5.
    unrelated_score = embedding_service.cosine_similarity(cat_vec, unrelated_vec)
    assert unrelated_score < 0.5

    # A vector compared with itself must score essentially 1.0.
    self_score = embedding_service.cosine_similarity(cat_vec, cat_vec)
    assert self_score > 0.99
def test_embedding_cache(embedding_service: EmbeddingService) -> None:
    """Repeated encodes of one text agree, also after save_cache/load_cache."""
    sentence = "This is a test sentence for the embedding cache."
    digest = generate_text_hash(sentence)

    def encode_sentence():
        # Same text and hash on every call, so lookups use the same key.
        return embedding_service.encode(sentence, digest)

    # First call: expected to compute the embedding and populate the cache.
    baseline = encode_sentence()

    # Second call: expected to be served from the cache.
    # NOTE(review): only the values are compared; this does not by itself
    # prove that the cache was hit rather than the embedding recomputed.
    repeated = encode_sentence()
    assert baseline == repeated

    # Round-trip the cache through save and load.
    embedding_service.save_cache()
    embedding_service.load_cache()

    # The result after the round trip must still match the first one.
    reloaded = encode_sentence()
    assert baseline == reloaded
|