{
 "cells": [
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "# CLS sentence embeddings with `cointegrated/rubert-tiny`\n",
    "\n",
    "Loads the pretrained tokenizer and encoder, defines a helper that turns text into an L2-normalised `[CLS]` embedding, and checks the embedding size on a sample phrase."
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "\n",
    "# Work from the parent directory; `from src import device` below is expected to resolve from there.\n",
    "# The starting directory is remembered so that re-running this cell does not\n",
    "# climb one more level up on every execution.\n",
    "if \"_NOTEBOOK_DIR\" not in globals():\n",
    "    _NOTEBOOK_DIR = os.getcwd()\n",
    "os.chdir(os.path.abspath(os.path.join(_NOTEBOOK_DIR, os.pardir)))\n",
    "\n",
    "import torch\n",
    "from transformers import AutoTokenizer, AutoModel\n",
    "from src import device"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "MODEL_NAME = \"cointegrated/rubert-tiny\"\n",
    "\n",
    "tokenizer = AutoTokenizer.from_pretrained(MODEL_NAME)\n",
    "model = AutoModel.from_pretrained(MODEL_NAME)\n",
    "\n",
    "# Inference only: move to the project's device and make eval mode explicit.\n",
    "model = model.to(device).eval()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "def embed_bert_cls(text, model, tokenizer):\n",
    "    \"\"\"Return the L2-normalised [CLS] embedding(s) of `text`.\n",
    "\n",
    "    Args:\n",
    "        text: one string, or a list/tuple of strings embedded as a single batch.\n",
    "        model: encoder with a `device` attribute whose output exposes `last_hidden_state`.\n",
    "        tokenizer: callable producing a mapping of tensors when given `return_tensors='pt'`.\n",
    "\n",
    "    Returns:\n",
    "        numpy array of shape (hidden_size,) for a single string, or\n",
    "        (len(text), hidden_size) for a list/tuple of strings.\n",
    "    \"\"\"\n",
    "    encoded = tokenizer(text, padding=True, truncation=True, return_tensors='pt')\n",
    "    with torch.no_grad():\n",
    "        model_output = model(**{k: v.to(model.device) for k, v in encoded.items()})\n",
    "    # Position 0 of every sequence is taken as the sentence representation.\n",
    "    cls_vectors = model_output.last_hidden_state[:, 0, :]\n",
    "    # Row-wise L2 normalisation (dim=1 is the hidden dimension of the 2-D batch).\n",
    "    cls_vectors = torch.nn.functional.normalize(cls_vectors, p=2, dim=1)\n",
    "    cls_vectors = cls_vectors.cpu().numpy()\n",
    "    # A single string keeps the original 1-D result; a batch keeps one row per text\n",
    "    # instead of silently dropping everything after the first item.\n",
    "    return cls_vectors[0] if isinstance(text, str) else cls_vectors"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "(312,)\n"
     ]
    }
   ],
   "source": [
    "print(embed_bert_cls('привет мир', model, tokenizer).shape)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "venv",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.12.3"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}