Using the techniques below, you can build a search engine that does not rely on deep learning (DL).
Vector search engine using a KD-Tree
A KD-Tree organizes the string (feature) vectors into a tree index, so the nearest vector can be looked up in O(log n) time.
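Before the full code, here is a minimal sketch of the core operation, using scipy.spatial.KDTree (the same class the code below wraps); the random data and the dimension 8 are arbitrary, illustrative choices:

import numpy as np
from scipy.spatial import KDTree

points = np.random.rand(1000, 8)         # 1,000 vectors of dimension 8
tree = KDTree(points)                    # build the tree once
dist, idx = tree.query(points[0], k=3)   # 3 nearest neighbors of the first vector
print(idx, dist)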
Code source: https://github.com/zhaozh10/ChatCAD/blob/main/search_engine/src/db.py
The code is as follows:
import os
import pickle


def save_db_kdtree(path_list, token_names, data, **kwargs):
    from scipy.spatial import KDTree
    # Resolve the output path: an explicit db_path wins, otherwise derive it from the name
    if "db_path" in kwargs:
        db_path = kwargs["db_path"]
    else:
        db_path = f"search_engine/db/{kwargs['name']}"
    if not os.path.splitext(db_path)[1]:
        db_path += '.pt'
    os.makedirs(os.path.dirname(db_path) or ".", exist_ok=True)  # make sure the target directory exists
    # L1-normalize each non-zero row so the tree compares term distributions
    data = data.toarray()
    for v in data:
        if sum(v) != 0:
            v /= sum(v)
    # Create a KDTree object
    tree = KDTree(data, copy_data=True)
    # Find the 5 nearest neighbors of the first point
    # distances, indices = tree.query(data[0], k=5)
    db = {"path_list": path_list, "token_names": token_names, "tree": tree}
    with open(db_path, 'wb') as f:
        pickle.dump(db, f)


def load_db_kdtree(**kwargs):
    # Resolve the path the same way as save_db_kdtree
    if "db_path" in kwargs:
        db_path = kwargs["db_path"]
    else:
        db_path = f"search_engine/db/{kwargs['name']}"
    if not os.path.splitext(db_path)[1]:
        db_path += '.pt'
    with open(db_path, 'rb') as f:
        db = pickle.load(f)
    path_list, token_names, tree = db["path_list"], db["token_names"], db["tree"]
    return Query_kdtree(path_list, token_names, tree)


class Query_kdtree:
    def __init__(self, path_list, token_names, tree) -> None:
        self.path_list, self.token_names, self.tree = path_list, token_names, tree

    def query(self, feature_vector, k=5):
        import numpy as np
        # Convert to a float array first: the original in-place `/=` fails on a plain list
        feature_vector = np.asarray(feature_vector, dtype=float)
        if feature_vector.sum() != 0:
            feature_vector = feature_vector / feature_vector.sum()
        distances, indices = self.tree.query(feature_vector, k, workers=-1)
        return [(self.path_list[pid], distances[i]) for i, pid in enumerate(indices)]


save_db = save_db_kdtree
load_db = load_db_kdtree

# test pass
if __name__ == "__main__":
    import scipy.sparse as sp
    save_db_kdtree([114, 203], ['a', 'b'], sp.csr_matrix([[.5, .5], [-.5, .5]]), name="try_1")
    q = load_db_kdtree(name="try_1")
    print(q.query([1, 1], k=2))  # [(114, 0.0), (203, 1.0)]
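Two details worth noting. First, every stored vector (in save_db_kdtree) and every query vector is L1-normalized, i.e. divided by the sum of its entries, so the tree compares term-frequency distributions rather than raw magnitudes. Second, the "database" is simply a pickled dict holding the path list, the token names, and the KDTree itself.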
Using the TF-IDF vector as a document's feature
When you have a (vocabs, document, library) setup, you can use the TF-IDF vector as the document's feature vector.
It can equally well be (vocabs, sentence, documents).
A sentence's TF-IDF vector is simply the TF-IDF values of all the tokens in that sentence.
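For reference, with scikit-learn's documented defaults (smooth_idf=True, each document vector then L2-normalized), the TfidfVectorizer used below computes:

$$\mathrm{tfidf}(t,d) = \mathrm{tf}(t,d)\cdot\left(\ln\frac{1+n}{1+\mathrm{df}(t)}+1\right)$$

where $\mathrm{tf}(t,d)$ is the count of token $t$ in document $d$, $n$ is the number of documents, and $\mathrm{df}(t)$ is the number of documents containing $t$.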
The code is as follows:
# -*- coding: utf-8 -*-
import jieba
from sklearn.feature_extraction.text import TfidfVectorizer
from sklearn.linear_model import LogisticRegression

# Sample text data (Chinese)
corpus = [
    "机器学习是人工智能的一个分支",
    "深度学习是机器学习的子领域",
    "自然语言处理是人工智能的重要方向"
]

# Custom Chinese tokenization function
def chinese_tokenizer(text):
    # Segment Chinese text with jieba
    return list(jieba.cut(text))

# Load a custom dictionary (optional)
# jieba.load_userdict("custom_dict.txt")  # uncomment if you have a custom dictionary

# Initialize the TfidfVectorizer
vectorizer = TfidfVectorizer(
    tokenizer=chinese_tokenizer,  # use the custom tokenizer
    token_pattern=None,           # suppress the warning sklearn emits when a tokenizer is given
    stop_words=None,              # optional: stop words (Chinese needs a custom list)
    max_features=1000             # cap the vocabulary size
)

# Compute the TF-IDF matrix
tfidf_matrix = vectorizer.fit_transform(corpus)

# Get the vocabulary
vocabulary = vectorizer.get_feature_names_out()

# Print the vocabulary
print("Vocabulary (token_names):")
print(vocabulary)
print("\n")

# Print the shape of the TF-IDF matrix
print("TF-IDF matrix shape (num documents, vocabulary size):", tfidf_matrix.shape)
print("\n")

# Convert the sparse matrix to a dense array to inspect the values
dense_matrix = tfidf_matrix.toarray()

# Print each document's TF-IDF vector
for i, doc in enumerate(corpus):
    print(f"Document {i + 1}:")
    print(doc)
    print("TF-IDF vector:")
    print(dense_matrix[i])
    print("-" * 50)
print("\n")

# Example: train a classifier on the TF-IDF vectors
# Assumed labels: 0 = machine learning, 1 = natural language processing
labels = [0, 0, 1]

# Train a logistic regression classifier
clf = LogisticRegression()
clf.fit(tfidf_matrix, labels)

# Test on a new text
new_text = ["人工智能的未来"]
new_tfidf = vectorizer.transform(new_text)
predicted_label = clf.predict(new_tfidf)[0]
print("Predicted label for the new text:", predicted_label)
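To connect the two halves, here is a hedged sketch (not part of the original repo): it assumes the db.py functions above are importable in the same session, reuses the corpus strings as the path_list identifiers, and the file name "tfidf_demo.pt" is made up for the example.

# Index the TF-IDF matrix from above in a KD-Tree database, then query it
save_db_kdtree(path_list=corpus, token_names=list(vocabulary),
               data=tfidf_matrix, db_path="tfidf_demo.pt")

q = load_db_kdtree(db_path="tfidf_demo.pt")
query_vec = vectorizer.transform(["深度学习的分支"]).toarray()[0]
print(q.query(query_vec, k=2))  # [(most similar document, distance), ...]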