"""Marimo notebook: exploring a Hugging Face dataset with Polars.

(Removed non-code residue from the web page this file was copied from:
Spaces status banner, file size, commit hashes, and line-number gutter.)
"""
# Standard library imports first, then third-party, per PEP 8.
import os

import marimo
import polars as pl

__generated_with = "0.10.15"

# NOTE(review): marimo cells do not see module-level names — cells receive
# `mo`, `pl`, `os` as parameters that must be *returned by another cell*.
# No cell in this file imports and returns them; verify against the marimo
# file format (normally a setup cell does `import marimo as mo; return (mo,)`).
app = marimo.App(
    app_title="Polars & Hugging Face Data Exploration",
    css_file="../custom.css",
)
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    # Notebook front matter: goals and prerequisites, rendered as markdown.
    mo.md(
        r"""
# Exploring a Hugging Face Dataset with Polars
In this notebook we demonstrate how to:
- **Lazy-load** a Hugging Face dataset (all Parquet files using a recursive globbing pattern).
- **Preview** the loaded DataFrame with metadata.
- **Interactively expand** the DataFrame view.
- Explore over 30 additional examples of Polars I/O functions and DataFrame manipulations—especially for handling large text data.
**Prerequisites:**
- Install dependencies via:
```bash
pip install polars marimo
```
- Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.

"""
    )
    return
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo):
    """Check for the Hugging Face API token and report its status."""
    # NOTE(review): `os` is used here but only `mo` is a cell parameter —
    # confirm `os` is actually in scope for this cell under marimo's rules.
    hf_token = os.environ.get("HF_TOKEN")
    # Security fix: the original interpolated the raw token into the rendered
    # markdown, leaking the secret into any shared/exported notebook output.
    # Report only whether the token is present.
    token_status = "set" if hf_token else "NOT set"
    mo.md(f"""
**Hugging Face Token:** `{token_status}`
*(Ensure that HF_TOKEN is set in your environment.)*
""")
    return
# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(mo, pl):
    """Load every Parquet shard of the HF dataset matched by a recursive glob."""
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

    @mo.lazy  # The mo.lazy decorator defers execution until the data is needed.
    # NOTE(review): confirm `mo.lazy` supports decorating a plain function like
    # this — in marimo it is documented as a lazy *UI/rendering* wrapper.
    def load_dataset():
        # Load all Parquet files matching the recursive pattern.
        # NOTE(review): pl.read_parquet is EAGER; the notebook text promises a
        # lazy load, which would be pl.scan_parquet — confirm which is intended.
        df = pl.read_parquet(dataset_url)
        # --- Alternative for local JSONL files (uncomment if needed):
        # df = pl.read_ndjson("/local/path/to/*.jsonl")
        return df

    df = load_dataset()
    # NOTE(review): marimo-generated cells normally return a tuple of the
    # names they define (i.e. `return (df,)`) — verify `return df` is accepted.
    return df
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(mo, lazy_load_dataset, pl):
    """Show a small table preview of the loaded DataFrame with metadata."""
    # NOTE(review): marimo cells receive *variables defined by other cells*,
    # not the cell functions themselves — verify that `lazy_load_dataset`
    # here resolves to the `df` value produced by that cell.
    df = lazy_load_dataset  # LazyFrame returned by load_dataset
    preview = mo.ui.table(df.head(), metadata=True)
    # NOTE(review): only the cell's last expression is displayed, so this
    # markdown is the output and `preview` is returned for other cells.
    mo.md(
        r"""
## Data Preview
Below is a preview of the first few rows along with basic metadata.
"""
    )
    return preview
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo, lazy_load_dataset, pl):
    """Offer a button intended to expand the DataFrame view."""
    df = lazy_load_dataset
    expand_button = mo.ui.button(label="Expand Dataframe")

    @expand_button.on_click
    def on_expand():
        # NOTE(review): the table constructed here is immediately discarded —
        # marimo renders cell outputs, not values created inside a callback —
        # and `mo.ui.button` is normally configured with an `on_click=` keyword
        # rather than used as a decorator. Verify this actually expands anything.
        mo.ui.table(df, width="100%", height="auto")

    mo.md(
        r"""
## Expand Dataframe
Click the button below to expand the DataFrame view.
"""
    )
    return expand_button
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    # Static reference cell: three column-selection recipes, markdown only.
    mo.md(
        r"""
## Column Selection Tips
**Example 1: Select specific columns by name:**
```python
selected_columns_df = df.select(["column1", "column2"])
```
**Example 2: Select all columns except column 'a':**
```python
all_except_a_df = df.select(pl.exclude("a"))
```
**Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
```python
range_columns_df = df.select(pl.col(df.columns[1:4]))
```
"""
    )
    return
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================
@app.cell
def example_1(mo):
    # Recipe: eager read of one Parquet shard from the HF hub.
    mo.md(
        r"""
### Example 1: Eagerly Read a Single Parquet File
```python
df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
```
"""
    )
    return


@app.cell
def example_2(mo):
    # Recipe: eager read over a glob of shards.
    mo.md(
        r"""
### Example 2: Read Multiple Parquet Files Using Globbing
```python
df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
```
"""
    )
    return


@app.cell
def example_3(mo):
    # Recipe: lazy scan with a recursive glob.
    mo.md(
        r"""
### Example 3: Lazily Scan Parquet Files with Recursive Globbing
```python
df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
```
"""
    )
    return


@app.cell
def example_4(mo):
    # Recipe: read a JSON file eagerly.
    mo.md(
        r"""
### Example 4: Read a JSON File into a DataFrame
```python
df_json = pl.read_json("data/sample.json")
```
"""
    )
    return


@app.cell
def example_5(mo):
    # Recipe: read JSON with an explicit schema.
    mo.md(
        r"""
### Example 5: Read JSON with a Specified Schema
```python
schema = {"name": pl.Utf8, "age": pl.Int64}
df_json = pl.read_json("data/sample.json", schema=schema)
```
"""
    )
    return
@app.cell
def example_6(mo):
    # Recipe: serialize a DataFrame to an NDJSON string.
    mo.md(
        r"""
### Example 6: Write a DataFrame to NDJSON Format
```python
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
ndjson_str = df.write_ndjson()
print(ndjson_str)
```
"""
    )
    return


@app.cell
def example_7(mo):
    # Recipe: inspect a Parquet schema without loading rows.
    mo.md(
        r"""
### Example 7: Get the Schema of a Parquet File Without Reading Data
```python
schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
print(schema)
```
"""
    )
    return


@app.cell
def example_8(mo):
    # Recipe: hive-partitioned lazy scan.
    mo.md(
        r"""
### Example 8: Scan Parquet Files with Hive Partitioning Enabled
```python
df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
```
"""
    )
    return


@app.cell
def example_9(mo):
    # Recipe: lazy NDJSON scan via glob.
    mo.md(
        r"""
### Example 9: Lazily Scan NDJSON Files Using Globbing
```python
df_lazy = pl.scan_ndjson("data/*.jsonl")
```
"""
    )
    return


@app.cell
def example_10(mo):
    # Recipe: write partitioned Parquet output.
    mo.md(
        r"""
### Example 10: Write a DataFrame to Partitioned Parquet Files
```python
df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
df.write_parquet("output/", partition_by=["date"])
```
"""
    )
    return
@app.cell
def example_11(mo):
    # Recipe: tune JSON schema inference length.
    mo.md(
        r"""
### Example 11: Read JSON with Custom Inference Length
```python
df = pl.read_json("data/large_text.json", infer_schema_length=500)
```
"""
    )
    return


@app.cell
def example_12(mo):
    # Recipe: override parts of an explicit JSON schema.
    mo.md(
        r"""
### Example 12: Read JSON with Schema Overrides
```python
schema = {"id": pl.Int64, "text": pl.Utf8}
overrides = {"id": pl.Int32}
df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
```
"""
    )
    return


@app.cell
def example_13(mo):
    # Recipe: NDJSON serialization returned as a string.
    mo.md(
        r"""
### Example 13: Write a DataFrame to NDJSON and Return as String
```python
df = pl.DataFrame({"foo": [1,2,3], "bar": [4,5,6]})
ndjson_output = df.write_ndjson()
print(ndjson_output)
```
"""
    )
    return


@app.cell
def example_14(mo):
    # Recipe: pass cloud credentials to a Parquet scan.
    mo.md(
        r"""
### Example 14: Scan Parquet Files with Cloud Storage Options
```python
storage_options = {"token": os.environ.get("HF_TOKEN")}
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
```
"""
    )
    return


@app.cell
def example_15(mo):
    # Recipe: pass cloud credentials to an NDJSON scan.
    mo.md(
        r"""
### Example 15: Scan NDJSON Files with Cloud Storage Options
```python
storage_options = {"token": os.environ.get("HF_TOKEN")}
df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
```
"""
    )
    return
@app.cell
def example_16(mo):
    # Recipe: filter before collect so the scan skips rows (predicate pushdown).
    mo.md(
        r"""
### Example 16: Predicate Pushdown Example
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Only load rows where 'value' > 100
df_filtered = df_lazy.filter(pl.col("value") > 100)
result = df_filtered.collect()
```
"""
    )
    return


@app.cell
def example_17(mo):
    # Recipe: select columns before collect (projection pushdown).
    mo.md(
        r"""
### Example 17: Projection Pushdown Example
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Only select the 'text' and 'id' columns to reduce memory footprint
df_proj = df_lazy.select(["id", "text"])
result = df_proj.collect()
```
"""
    )
    return


@app.cell
def example_18(mo):
    # Recipe: materialize a lazy query with collect().
    mo.md(
        r"""
### Example 18: Collecting a Lazy DataFrame
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Perform lazy operations...
result = df_lazy.collect()
print(result)
```
"""
    )
    return


@app.cell
def example_19(mo):
    # Recipe: substring filter on a text column.
    mo.md(
        r"""
### Example 19: Filtering on a Large Text Column
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Filter rows where the 'text' column contains a long string pattern
df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
print(df_filtered.head())
```
"""
    )
    return


@app.cell
def example_20(mo):
    # Recipe: derive a text-length column.
    mo.md(
        r"""
### Example 20: Using String Length on a Text Column
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Compute the length of text in the 'text' column
df = df.with_columns(text_length=pl.col("text").str.len())
print(df.head())
```
"""
    )
    return
@app.cell
def example_21(mo):
    # Recipe: group-by aggregation over a text metric.
    mo.md(
        r"""
### Example 21: Grouping by a Large Text Field
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
grouped = df.group_by("category").agg(pl.col("text").str.len().mean().alias("avg_text_length"))
print(grouped.collect())
```
"""
    )
    return


@app.cell
def example_22(mo):
    # Recipe: inner join on a shared key.
    mo.md(
        r"""
### Example 22: Joining Two DataFrames on a Common Key
```python
df1 = pl.DataFrame({"id": [1,2,3], "text": ["A", "B", "C"]})
df2 = pl.DataFrame({"id": [1,2,3], "value": [100, 200, 300]})
joined = df1.join(df2, on="id")
print(joined)
```
"""
    )
    return


@app.cell
def example_23(mo):
    # Recipe: nearest-timestamp (asof) join.
    mo.md(
        r"""
### Example 23: Using join_asof for Time-based Joins
```python
df1 = pl.DataFrame({
"time": pl.date_range(low="2025-01-01", high="2025-01-02", interval="1h"),
"text": ["sample text"] * 25
})
df2 = pl.DataFrame({
"time": pl.date_range(low="2025-01-01 00:30", high="2025-01-02", interval="1h"),
"value": list(range(25))
})
# Perform an asof join to match the nearest timestamp
joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
print(joined)
```
"""
    )
    return


@app.cell
def example_24(mo):
    # Recipe: low-memory Parquet read.
    mo.md(
        r"""
### Example 24: Reading a Parquet File with Low Memory Option
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
print(df.head())
```
"""
    )
    return


@app.cell
def example_25(mo):
    # Recipe: choose the parallel strategy for a scan.
    mo.md(
        r"""
### Example 25: Scanning Parquet Files with a Parallel Strategy
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
result = df_lazy.collect()
print(result)
```
"""
    )
    return
@app.cell
def example_26(mo):
    # Recipe: large JSON read with bounded schema inference.
    mo.md(
        r"""
### Example 26: Reading a Large JSON File into a DataFrame
```python
df = pl.read_json("data/large_text.json", infer_schema_length=200)
print(df.head())
```
"""
    )
    return


@app.cell
def example_27(mo):
    # Recipe: peek at the first rows.
    mo.md(
        r"""
### Example 27: Using DataFrame.head() on a Large Text Dataset
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
print(df.head(10))
```
"""
    )
    return


@app.cell
def example_28(mo):
    # Recipe: peek at the last rows.
    mo.md(
        r"""
### Example 28: Using DataFrame.tail() on a Large Text Dataset
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
print(df.tail(10))
```
"""
    )
    return


@app.cell
def example_29(mo):
    # Recipe: rechunk NDJSON scan results into contiguous memory.
    mo.md(
        r"""
### Example 29: Scanning NDJSON Files with Rechunking
```python
df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
result = df_lazy.collect()
print(result)
```
"""
    )
    return


@app.cell
def example_30(mo):
    # Recipe: tolerate shards with missing columns.
    mo.md(
        r"""
### Example 30: Scanning Parquet Files with Allowing Missing Columns
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
result = df_lazy.collect()
print(result)
```
"""
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    # Closing cell: summary and further-reading links.
    mo.md(
        r"""
# Conclusion
This notebook showcased:
- How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
- How to preview and interactively expand the DataFrame.
- Over 30 examples covering various Polars I/O functions and DataFrame operations,
which are especially useful when working with large text data.
For more information, please refer to:
- [Polars Documentation](https://docs.pola.rs/)
- [Hugging Face Hub Documentation](https://huggingface.co./docs)
- [Marimo Notebook Documentation](https://marimo.io/)
Happy Data Exploring!
"""
    )
    return
# Script entry point: run the marimo app when executed directly.
if __name__ == "__main__":
    app.run()