""" Load embedding models from huggingface. """ import torch from langchain.embeddings import HuggingFaceEmbeddings def get_hf_embeddings(model_name=None): """Get huggingface embedding by name.""" if model_name is None: # Some candiates # "BAAI/bge-m3" (good, though large and slow) # "BAAI/bge-base-en-v1.5" -> also good # "sentence-transformers/all-mpnet-base-v2" # "maidalun1020/bce-embedding-base_v1" # "intfloat/multilingual-e5-large" # Ref: https://huggingface.co./spaces/mteb/leaderboard # https://huggingface.co./maidalun1020/bce-embedding-base_v1 model_name = "BAAI/bge-large-en-v1.5" embeddings = HuggingFaceEmbeddings(model_name=model_name) return embeddings def get_jinaai_embeddings( model_name="jinaai/jina-embeddings-v2-base-en", device="auto" ): """Get jinaai embedding.""" # device: cpu or cuda if device == "auto": device = "cuda" if torch.cuda.is_available() else "cpu" # For jinaai. Ref: https://github.com/langchain-ai/langchain/issues/6080 from transformers import AutoModel model = AutoModel.from_pretrained( model_name, trust_remote_code=True ) # -> will yield error, need bug fixing model_name = model_name model_kwargs = {"device": device, "trust_remote_code": True} embeddings = HuggingFaceEmbeddings( model_name=model_name, model_kwargs=model_kwargs, ) return embeddings