From 787c3161e354b00310d49664246425a792b254bb Mon Sep 17 00:00:00 2001
From: Coding with Peter
Date: Thu, 27 Apr 2023 08:10:18 -0700
Subject: [PATCH] update vector search to use 'annoy'

---
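For review context, a minimal sketch of the build-then-query Annoy flow this
patch adopts, assuming annoy and sentence-transformers are installed; the file
name 'demo.ann' and the sample sentences are placeholders, not part of
content.py:

    from annoy import AnnoyIndex
    from sentence_transformers import SentenceTransformer

    model = SentenceTransformer('all-MiniLM-L6-v2')
    dim = 384  # all-MiniLM-L6-v2 produces 384-dim embeddings

    # Build phase: embed sentences, add each vector under an integer id
    sentences = ["Registration opens in May.", "The library closes at 9pm."]
    t = AnnoyIndex(dim, 'angular')  # angular distance ~ cosine similarity
    for i, emb in enumerate(model.encode(sentences)):
        t.add_item(i, emb)
    t.build(10)         # more trees -> better recall, larger index file
    t.save('demo.ann')  # the index is immutable once built

    # Query phase: mmap the saved index and search by vector
    u = AnnoyIndex(dim, 'angular')
    u.load('demo.ann')
    ids, dists = u.get_nns_by_vector(
        model.encode("when can I register?"), 2, include_distances=True)
    for i, d in zip(ids, dists):
        print(sentences[i], round(d, 3))

Integer ids are the only payload Annoy stores, which is why the patch pickles
a parallel [index, file, sentence] list to map results back to text.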
 content.py | 101 +++++++++++++++++++++++++++++++----------------------
 1 file changed, 60 insertions(+), 41 deletions(-)

diff --git a/content.py b/content.py
index 848d476..66d5c08 100644
--- a/content.py
+++ b/content.py
@@ -1044,38 +1044,6 @@ def txt_clean_index():
 
 
-def search_embeddings():
-    model = SentenceTransformer('all-MiniLM-L6-v2')
-    save_embeds = pickle.load( open( "cache/embeddings.p", "rb" ) )
-    columns = list(zip(*save_embeds))
-    files = columns[0]
-    sentences = columns[1]
-    embeddings = columns[2]
-
-    print(files[:20])
-    print(sentences[:20])
-    print(embeddings[:20])
-
-    s = ''
-    while s != 'q':
-        s = input("search or 'q' to quit: ")
-        if s == 'q':
-            return
-        query_embedding = model.encode(s)
-
-        # Compute the cosine similarity between the query embedding and the sentence embeddings
-        cosine_scores = util.cos_sim(query_embedding, embeddings)
-
-        # Sort the sentences by their cosine similarity to the query sentence
-        results = sorted(zip(sentences, cosine_scores, files), key=lambda x: x[1], reverse=True)
-
-        print(results[:5])
-
-        # Print the top 5 results
-        for i, (sentence, score, file) in enumerate(results[:5]):
-            print(f'Top {i+1}: {file} - {sentence} - (Score: {score})')
-
-
 from whoosh import fields, columns
 from whoosh.index import create_in, open_dir
@@ -1207,27 +1175,77 @@ def create_search_index():
     writer.commit()
 
 
+
+from annoy import AnnoyIndex
+import random
+
+def test_embed():
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    sample = "What is this world coming to? What happens in the data and the research?"
+    embed = model.encode(sample)
+
+    print("\nSample sentence:", sample)
+    print("\nEmbedding:", embed)
+    print("\nEmbedding size:", len(embed))
+
+
 def create_embeddings():
     model = SentenceTransformer('all-MiniLM-L6-v2')
+    vecsize = 384 # sentence transformer embedding size
+    t = AnnoyIndex(vecsize, 'angular')
     files = os.listdir('cache/crawl')
-    output = []
-    save_embeds = [] # ['file','sentence','embedding']
+    output = [] # ['index','file','sentence']
+    index = 0
+    save_embeds = []
     files.sort()
     for f in files:
+        print(f)
         m = re.match(r'https?..www\.gavilan\.edu\+(.*)\.\w\w\w\w?\.txt$',f)
         if m:
             lines = displayfile(f,1)
             embeddings = model.encode(lines)
-            print("\n-----", f)
+            print("\n-----", index, f)
 
-            #Print the embeddings
             for sentence, embedding in zip(lines, embeddings):
-                print("Sentence:", sentence)
-                #print("Embedding:", embedding)
-
-                save_embeds.append([f,sentence,embedding])
-    pickle.dump( save_embeds, open( "cache/embeddings.p", "wb" ) )
+                if len(sentence.split(' ')) > 5:
+                    print(index, "Sentence:", sentence)
+                    print(embedding[:8])
+                    t.add_item(index, embedding)
+                    output.append( [index,f,sentence] )
+                    index += 1
+        if index > 500:
+            break
+    t.build(30) # 30 trees
+    t.save('cache/sentences.ann')
+    pickle.dump( output, open( "cache/embedding_index.p", "wb" ) )
+
+
+
+
+def search_embeddings():
+    f = 384 # sentence transformer embedding size
+    n = 10 # how many results
+
+    u = AnnoyIndex(f, 'angular')
+    u.load('cache/sentences.ann') # super fast, will just mmap the file
+    print(u.get_n_items(), "items in index")
+    model = SentenceTransformer('all-MiniLM-L6-v2')
+    search_index = pickle.load( open( "cache/embedding_index.p", "rb" ) )
+    print(search_index)
+
+
+    s = ''
+    while s != 'q':
+        s = input("search or 'q' to quit: ")
+        if s == 'q':
+            return
+        query_embedding = model.encode(s)
+        results = u.get_nns_by_vector(query_embedding, n)
+
+        # Print the top n results
+        for i, r in enumerate(results):
+            print(f'Top {i+1}: {r}, {search_index[r]}')  # r is the Annoy id; search_index[r] is [index, file, sentence]
 
 if __name__ == "__main__":
@@ -1248,6 +1266,7 @@ if __name__ == "__main__":
         13: ['do an index search', search_index],
         14: ['do a vector search', search_embeddings],
         15: ['test priority', test_priority],
+        16: ['test embed', test_embed],
     }
 
     if len(sys.argv) > 1 and re.search(r'^\d+',sys.argv[1]):