NLP2025_title_search / prepare_pool.py
kaisugi's picture
update data
ffb2a92
raw
history blame contribute delete
717 Bytes
from sentence_transformers import SentenceTransformer
import numpy as np
import pandas as pd
# Build the embedding pool for title search: read the paper titles from the
# TSV, embed them with a sentence-embedding model, and save the matrix.

# Sanity-check values for the anlp2025.tsv snapshot this script targets.
EXPECTED_PAPER_COUNT = 777
EXPECTED_EMBEDDING_DIM = 1792
EXPECTED_FIRST_TITLE = "LLMのアテンションヘッドに着目したジェイルブレイク攻撃の分析と防御手法の提案"
EXPECTED_LAST_TITLE = "ニュース記事中の企業名のEntity LinkingにおけるQuestion Answeringを用いた曖昧性解消"

# Column names are supplied explicitly, so the first line of the file is
# read as data rather than as a header row.
paper_df = pd.read_csv('anlp2025.tsv', names=["pid", "title"], sep="\t")

# The checks below raise explicitly instead of using `assert`, because
# assert statements are removed when Python runs with -O and a wrong or
# truncated input file would then be embedded and saved silently.
if len(paper_df) != EXPECTED_PAPER_COUNT:
    raise ValueError(
        f"expected {EXPECTED_PAPER_COUNT} papers in anlp2025.tsv, "
        f"got {len(paper_df)}"
    )

input_texts = paper_df["title"].tolist()

# Guard against a reordered or mismatched file: the row order here is the
# row order of the saved embedding matrix.
if input_texts[0] != EXPECTED_FIRST_TITLE:
    raise ValueError(f"unexpected first title: {input_texts[0]!r}")
if input_texts[-1] != EXPECTED_LAST_TITLE:
    raise ValueError(f"unexpected last title: {input_texts[-1]!r}")

model = SentenceTransformer("sbintuitions/sarashina-embedding-v1-1b")
embeddings = model.encode(input_texts)

expected_shape = (EXPECTED_PAPER_COUNT, EXPECTED_EMBEDDING_DIM)
if embeddings.shape != expected_shape:
    raise ValueError(
        f"expected embeddings of shape {expected_shape}, "
        f"got {embeddings.shape}"
    )

# np.savez appends ".npz" to the name; an array passed positionally is
# stored under the default key "arr_0".
np.savez("anlp2025", embeddings)