updated assessment and solutions notebooks
- notebooks/assesment.ipynb +25 -32
- notebooks/solutions.ipynb +15 -25
notebooks/assesment.ipynb
CHANGED
@@ -292,39 +292,32 @@
    "source": [
     "# 9. Bonus 2: Transformer Embeddings UDF\n",
     "# ======================================\n",
-    "
-    "
-    "\n",
-    "
-    "
-    "#
-    "
-    "
-    "#
-    "
-    "
-    "
-    "
-    "
-    "\n",
-    "#
-    "\n",
-    "
-    "#
-    "
-    "# # NOTE: verify shape (embedding might be list of lists)\n",
-    "# return ???\n",
-    "\n",
-    "# 9.3 Wrap that function in a PySpark UDF\n",
-    "# from pyspark.sql.functions import udf\n",
-    "# from pyspark.sql.types import ArrayType, FloatType\n",
-    "# udf_get_name_embedding = udf(get_name_embedding, ArrayType(FloatType()))\n",
-    "\n",
-    "# 9.4 Apply the UDF to create a new column 'NameEmbedding' in spark_merged_clean\n",
-    "# spark_embedded = spark_merged_clean.withColumn(\"NameEmbedding\", udf_get_name_embedding(F.col(\"Name\")))\n",
-    "\n",
-    "# spark_embedded.select(\"Name\", \"NameEmbedding\").show(truncate=False)\n"
+    "\n",
+    "from sentence_transformers import SentenceTransformer\n",
+    "from pyspark.sql.functions import udf\n",
+    "from pyspark.sql.types import ArrayType, FloatType\n",
+    "\n",
+    "# Load the pre-trained MiniLM sentence transformer model\n",
+    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
+    "\n",
+    "# Define a UDF to compute the embeddings\n",
+    "def compute_embedding(text):\n",
+    "    '''\n",
+    "    Your function goes here\n",
+    "    '''\n",
+    "    pass\n",
+    "\n",
+    "# Register the UDF in Spark\n",
+    "embedding_udf = None #Replace with your udf\n",
+    "\n",
+    "# Apply the UDF to compute embeddings for each document\n",
+    "df_with_embeddings = spark_merged_clean.withColumn('mini-lm-vectors', '...')"
    ]
+  },
+  {
+   "cell_type": "markdown",
+   "metadata": {},
+   "source": []
   }
 ],
 "metadata": {
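The reworked exercise cell replaces the old transformers-pipeline skeleton with sentence-transformers and leaves three blanks for the student: the body of compute_embedding, the embedding_udf registration, and the second argument of withColumn, which must ultimately be a Column expression (the '...' placeholder fails as written, since withColumn does not accept a plain string value). As orientation, here is a local sanity check, illustrative only and not part of the commit, showing the shape the UDF's return type has to match:

# Illustrative sanity check (not part of the commit). all-MiniLM-L6-v2 maps
# each input string to a single 384-dimensional vector, so a plain Python
# list of floats fits the ArrayType(FloatType()) return type used above.
from sentence_transformers import SentenceTransformer

model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
vector = model.encode('example name')   # numpy.ndarray with shape (384,)
print(vector.shape)                     # (384,)
print(type(vector.tolist()[0]))         # <class 'float'>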
notebooks/solutions.ipynb
CHANGED
@@ -265,36 +265,26 @@
    "metadata": {},
    "outputs": [],
    "source": [
-    "
-    "from transformers import pipeline\n",
-    "embedding_pipeline = pipeline(\"feature-extraction\", model=\"distilbert-base-uncased\")\n",
-    "\n",
-    "# Example function to get the name embedding\n",
-    "def get_name_embedding(name: str):\n",
-    "    # The pipeline will return a list of lists of floats.\n",
-    "    # Typically shape: (1, sequence_length, hidden_size).\n",
-    "    # We'll take the first token or perhaps average them.\n",
-    "    output = embedding_pipeline(name)\n",
-    "    # output[0] is shape [sequence_length, hidden_size]\n",
-    "    # let's do a simple average across the sequence dimension:\n",
-    "    token_embeddings = output[0]\n",
-    "    # average across tokens:\n",
-    "    mean_embedding = [float(sum(x) / len(x)) for x in zip(*token_embeddings)]\n",
-    "    return mean_embedding\n",
-    "\n",
-    "# Convert this Python function to a Spark UDF\n",
+    "from sentence_transformers import SentenceTransformer\n",
     "from pyspark.sql.functions import udf\n",
     "from pyspark.sql.types import ArrayType, FloatType\n",
     "\n",
-    "
+    "# Load the pre-trained MiniLM sentence transformer model\n",
+    "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
     "\n",
-    "#
-    "
-    " \
-    "
-    "
+    "# Define a UDF to compute the embeddings\n",
+    "def compute_embedding(text):\n",
+    "    return model.encode(text).tolist()\n",
+    "\n",
+    "# Register the UDF in Spark\n",
+    "embedding_udf = udf(compute_embedding, ArrayType(FloatType()))\n",
+    "\n",
+    "# Apply the UDF to compute embeddings for each document\n",
+    "df_with_embeddings = spark_merged_clean.withColumn('mini-lm-vectors', embedding_udf(spark_merged_clean['Name']))\n",
     "\n",
-    "
+    "# Show the result\n",
+    "df_with_embeddings.head()\n",
+    "\n"
    ]
   }
 ],
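The committed solution loads the model once on the driver and closes over it in a row-wise UDF, so Spark ships the model to the executors and encode is called one name at a time; model.encode(text).tolist() yields plain floats, matching ArrayType(FloatType()). One caveat: the hyphenated column name 'mini-lm-vectors' works with withColumn and col but must be backtick-quoted inside SQL expressions such as selectExpr. Below is a batched variant, a sketch rather than part of the commit, assuming Spark 3.x with PyArrow available; the embed_names name and the underscored output column are choices of this sketch:

# Sketch of a batched alternative (not part of the commit). An iterator-style
# pandas UDF loads the model once per task and encodes whole batches, avoiding
# a separate encode() call per row.
from typing import Iterator

import pandas as pd
from pyspark.sql.functions import col, pandas_udf
from pyspark.sql.types import ArrayType, FloatType

@pandas_udf(ArrayType(FloatType()))
def embed_names(batches: Iterator[pd.Series]) -> Iterator[pd.Series]:
    # Import and load inside the UDF so each executor builds the model locally
    # instead of deserialising the driver's copy for every row.
    from sentence_transformers import SentenceTransformer
    model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    for batch in batches:
        vectors = model.encode(batch.tolist())          # shape (len(batch), 384)
        yield pd.Series([v.tolist() for v in vectors])

# Hypothetical usage against the notebook's DataFrame:
# df_with_embeddings = spark_merged_clean.withColumn('mini_lm_vectors', embed_names(col('Name')))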