deagar commited on
Commit
96e2e87
·
1 Parent(s): 708c3cf

updated assessment and solutions notebooks

Browse files
Files changed (2) hide show
  1. notebooks/assesment.ipynb +25 -32
  2. notebooks/solutions.ipynb +15 -25
notebooks/assesment.ipynb CHANGED
@@ -292,39 +292,32 @@
292
  "source": [
293
  "# 9. Bonus 2: Transformer Embeddings UDF\n",
294
  "# ======================================\n",
295
- "# We'll demonstrate a simple approach using a lightweight transformer model to embed passenger names.\n",
296
- "# This is optional, but shows advanced usage of Spark UDFs.\n",
297
- "\n",
298
- "# Requirements: e.g. \"transformers\" or \"sentence-transformers\" in your environment.\n",
299
- "# from transformers import pipeline\n",
300
- "# embedding_pipeline = pipeline(\"feature-extraction\", model=\"distilbert-base-uncased\")\n",
301
- "# OR\n",
302
- "# from sentence_transformers import SentenceTransformer\n",
303
- "# model = SentenceTransformer(\"all-MiniLM-L6-v2\")\n",
304
- "\n",
305
- "# 9.1 TODO: import / load the model/pipeline\n",
306
- "# e.g.\n",
307
- "# from transformers import pipeline\n",
308
- "# embedding_pipeline = pipeline(\"feature-extraction\", model=\"distilbert-base-uncased\")\n",
309
- "\n",
310
- "# 9.2 Define a Python function that takes a passenger name (string) -> returns a list of floats\n",
311
- "\n",
312
- "# def get_name_embedding(name: str) -> List[float]:\n",
313
- "# # TODO: use embedding_pipeline or model to produce an embedding\n",
314
- "# # embedding = ?\n",
315
- "# # NOTE: verify shape (embedding might be list of lists)\n",
316
- "# return ???\n",
317
- "\n",
318
- "# 9.3 Wrap that function in a PySpark UDF\n",
319
- "# from pyspark.sql.functions import udf\n",
320
- "# from pyspark.sql.types import ArrayType, FloatType\n",
321
- "# udf_get_name_embedding = udf(get_name_embedding, ArrayType(FloatType()))\n",
322
- "\n",
323
- "# 9.4 Apply the UDF to create a new column 'NameEmbedding' in spark_merged_clean\n",
324
- "# spark_embedded = spark_merged_clean.withColumn(\"NameEmbedding\", udf_get_name_embedding(F.col(\"Name\")))\n",
325
- "\n",
326
- "# spark_embedded.select(\"Name\", \"NameEmbedding\").show(truncate=False)\n"
327
  ]
 
 
 
 
 
328
  }
329
  ],
330
  "metadata": {
 
292
  "source": [
293
  "# 9. Bonus 2: Transformer Embeddings UDF\n",
294
  "# ======================================\n",
295
+ "\n",
296
+ "from sentence_transformers import SentenceTransformer\n",
297
+ "from pyspark.sql.functions import udf\n",
298
+ "from pyspark.sql.types import ArrayType, FloatType\n",
299
+ "\n",
300
+ "# Load the pre-trained MiniLM sentence transformer model\n",
301
+ "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
302
+ "\n",
303
+ "# Define a UDF to compute the embeddings\n",
304
+ "def compute_embedding(text):\n",
305
+ " '''\n",
306
+ " Your function goes here\n",
307
+ " '''\n",
308
+ " pass\n",
309
+ "\n",
310
+ "# Register the UDF in Spark\n",
311
+ "embedding_udf = None #Replace with your udf\n",
312
+ "\n",
313
+ "# Apply the UDF to compute embeddings for each document\n",
314
+ "df_with_embeddings = spark_merged_clean.withColumn('mini-lm-vectors', '...')"
 
 
 
 
 
 
 
 
 
 
 
 
315
  ]
316
+ },
317
+ {
318
+ "cell_type": "markdown",
319
+ "metadata": {},
320
+ "source": []
321
  }
322
  ],
323
  "metadata": {
notebooks/solutions.ipynb CHANGED
@@ -265,36 +265,26 @@
265
  "metadata": {},
266
  "outputs": [],
267
  "source": [
268
- "# Example imports (make sure 'transformers' is installed)\n",
269
- "from transformers import pipeline\n",
270
- "embedding_pipeline = pipeline(\"feature-extraction\", model=\"distilbert-base-uncased\")\n",
271
- "\n",
272
- "# Example function to get the name embedding\n",
273
- "def get_name_embedding(name: str):\n",
274
- " # The pipeline will return a list of lists of floats.\n",
275
- " # Typically shape: (1, sequence_length, hidden_size).\n",
276
- " # We'll take the first token or perhaps average them.\n",
277
- " output = embedding_pipeline(name)\n",
278
- " # output[0] is shape [sequence_length, hidden_size]\n",
279
- " # let's do a simple average across the sequence dimension:\n",
280
- " token_embeddings = output[0]\n",
281
- " # average across tokens:\n",
282
- " mean_embedding = [float(sum(x) / len(x)) for x in zip(*token_embeddings)]\n",
283
- " return mean_embedding\n",
284
- "\n",
285
- "# Convert this Python function to a Spark UDF\n",
286
  "from pyspark.sql.functions import udf\n",
287
  "from pyspark.sql.types import ArrayType, FloatType\n",
288
  "\n",
289
- "udf_get_name_embedding = udf(get_name_embedding, ArrayType(FloatType()))\n",
 
290
  "\n",
291
- "# Apply it to add a new column\n",
292
- "spark_embedded = spark_merged_clean.withColumn(\n",
293
- " \"NameEmbedding\",\n",
294
- " udf_get_name_embedding(F.col(\"Name\"))\n",
295
- ")\n",
 
 
 
 
296
  "\n",
297
- "spark_embedded.select(\"Name\", \"NameEmbedding\").show(truncate=False)\n"
 
 
298
  ]
299
  }
300
  ],
 
265
  "metadata": {},
266
  "outputs": [],
267
  "source": [
268
+ "from sentence_transformers import SentenceTransformer\n",
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
269
  "from pyspark.sql.functions import udf\n",
270
  "from pyspark.sql.types import ArrayType, FloatType\n",
271
  "\n",
272
+ "# Load the pre-trained MiniLM sentence transformer model\n",
273
+ "model = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')\n",
274
  "\n",
275
+ "# Define a UDF to compute the embeddings\n",
276
+ "def compute_embedding(text):\n",
277
+ " return model.encode(text).tolist()\n",
278
+ "\n",
279
+ "# Register the UDF in Spark\n",
280
+ "embedding_udf = udf(compute_embedding, ArrayType(FloatType()))\n",
281
+ "\n",
282
+ "# Apply the UDF to compute embeddings for each document\n",
283
+ "df_with_embeddings = spark_merged_clean.withColumn('mini-lm-vectors', embedding_udf(spark_merged_clean['Name']))\n",
284
  "\n",
285
+ "# Show the result\n",
286
+ "df_with_embeddings.head()\n",
287
+ "\n"
288
  ]
289
  }
290
  ],