import marimo

__generated_with = "0.10.15"
app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")


# =============================================================================
# Imports Cell
# =============================================================================
@app.cell
def setup():
    import os

    import marimo as mo
    import polars as pl

    return mo, os, pl
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:

        - **Lazy-load** a Hugging Face dataset (all Parquet files, using a recursive globbing pattern).
        - **Preview** the loaded DataFrame with metadata.
        - **Interactively expand** the DataFrame view.
        - Explore 30 additional examples of Polars I/O functions and DataFrame manipulations, especially for handling large text data.

        **Prerequisites:**

        - Install dependencies via:
          ```bash
          pip install polars marimo
          ```
        - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.
        """
    )
    return
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo, os):
    hf_token = os.environ.get("HF_TOKEN")
    # Report whether the token is set without echoing the secret itself.
    status = "set" if hf_token else "NOT set"
    mo.md(
        f"""
        **Hugging Face token:** `{status}`
        *(Ensure that `HF_TOKEN` is set in your environment; its value is never displayed.)*
        """
    )
    return (hf_token,)
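# =============================================================================
# Using the token with Polars (sketch)
# =============================================================================
# A minimal sketch of how the token loaded above could be passed to Polars
# when reading private datasets from the Hub; it mirrors the storage_options
# pattern of Examples 14-15 below. The dataset path is hypothetical.
@app.cell
def token_usage_sketch(mo):
    mo.md(
        r"""
        ```python
        storage_options = {"token": hf_token}
        df_lazy = pl.scan_parquet(
            "hf://datasets/myuser/my-dataset/**/*.parquet",  # hypothetical path
            storage_options=storage_options,
        )
        ```
        """
    )
    return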
# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(pl):
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"
    # scan_parquet builds a LazyFrame: execution is deferred until the data is
    # collected, so nothing is downloaded at this point.
    df = pl.scan_parquet(dataset_url)
    # --- Alternative for local JSONL files (uncomment if needed):
    # df = pl.scan_ndjson("/local/path/to/*.jsonl")
    return (df,)
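# =============================================================================
# Materializing a sample (sketch)
# =============================================================================
# pl.scan_parquet only builds a query plan; no data is read until collect()
# runs. This hedged sketch materializes just the first five rows, assuming the
# remote dataset is reachable and HF_TOKEN (if required) is configured.
@app.cell
def materialize_sample(df):
    sample = df.head(5).collect()  # fetches only what is needed for 5 rows
    sample
    return (sample,)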
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(mo, df):
    # df is a LazyFrame, so materialize a small sample for display.
    preview = mo.ui.table(df.head(10).collect())
    mo.vstack(
        [
            mo.md(
                r"""
                ## Data Preview
                Below is a preview of the first few rows along with basic column summaries.
                """
            ),
            preview,
        ]
    )
    return (preview,)
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # The counter pattern makes clicks observable: each click increments value.
    expand_button = mo.ui.button(
        label="Expand DataFrame", value=0, on_click=lambda value: value + 1
    )
    mo.vstack(
        [
            mo.md(
                r"""
                ## Expand DataFrame
                Click the button below to expand the DataFrame view.
                """
            ),
            expand_button,
        ]
    )
    return (expand_button,)
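# =============================================================================
# Reacting to the expand button (sketch)
# =============================================================================
# marimo re-runs any cell that reads a UI element's value. This hedged
# downstream cell renders the full table once the button has been clicked at
# least once; collect() is assumed because df is a LazyFrame, and it will pull
# the whole dataset into memory.
@app.cell
def expanded_table(mo, df, expand_button):
    (
        mo.ui.table(df.collect())
        if expand_button.value
        else mo.md("*Click the button above to expand the DataFrame.*")
    )
    return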
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    mo.md(
        r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**
        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**
        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    )
    return
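# =============================================================================
# Column selection demo (sketch)
# =============================================================================
# A self-contained, runnable version of the three tips above, using a small
# in-memory DataFrame so no remote data is needed. The column names are
# illustrative only.
@app.cell
def column_selection_demo(pl):
    demo = pl.DataFrame(
        {"a": [1, 2], "column1": ["x", "y"], "column2": [0.1, 0.2], "b": [True, False]}
    )
    by_name = demo.select(["column1", "column2"])      # tip 1: explicit names
    without_a = demo.select(pl.exclude("a"))           # tip 2: everything but 'a'
    by_range = demo.select(pl.col(demo.columns[1:4]))  # tip 3: positional range
    (by_name, without_a, by_range)
    return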
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================
@app.cell
def example_1(mo):
    mo.md(
        r"""
        ### Example 1: Eagerly Read a Single Parquet File
        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    )
    return


@app.cell
def example_2(mo):
    mo.md(
        r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing
        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    )
    return


@app.cell
def example_3(mo):
    mo.md(
        r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    )
    return


@app.cell
def example_4(mo):
    mo.md(
        r"""
        ### Example 4: Read a JSON File into a DataFrame
        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    )
    return


@app.cell
def example_5(mo):
    mo.md(
        r"""
        ### Example 5: Read JSON with a Specified Schema
        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    )
    return


@app.cell
def example_6(mo):
    mo.md(
        r"""
        ### Example 6: Write a DataFrame to NDJSON Format
        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    )
    return


@app.cell
def example_7(mo):
    mo.md(
        r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data
        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    )
    return


@app.cell
def example_8(mo):
    mo.md(
        r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled
        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    )
    return


@app.cell
def example_9(mo):
    mo.md(
        r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing
        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    )
    return


@app.cell
def example_10(mo):
    mo.md(
        r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files
        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    )
    return
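# A hedged follow-up to Example 10: the partitioned output can be read back
# lazily with hive partitioning enabled (the same flag as in Example 8), so
# the 'date' partition column is reconstructed from the directory names.
@app.cell
def example_10_followup(mo):
    mo.md(
        r"""
        ```python
        df_lazy = pl.scan_parquet("output/**/*.parquet", hive_partitioning=True)
        print(df_lazy.collect())
        ```
        """
    )
    return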
@app.cell
def example_11(mo):
    mo.md(
        r"""
        ### Example 11: Read JSON with Custom Inference Length
        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    )
    return


@app.cell
def example_12(mo):
    mo.md(
        r"""
        ### Example 12: Read JSON with Schema Overrides
        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    )
    return
@app.cell
def example_13(mo):
    mo.md(
        r"""
        ### Example 13: Write a DataFrame to an NDJSON File
        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [4, 5, 6]})
        df.write_ndjson("output.jsonl")
        ```
        """
    )
    return
@app.cell
def example_14(mo):
    mo.md(
        r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options
        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_15(mo):
    mo.md(
        r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options
        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    )
    return


@app.cell
def example_16(mo):
    mo.md(
        r"""
        ### Example 16: Predicate Pushdown Example
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    )
    return
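# A hedged follow-up to Example 16: LazyFrame.explain() returns the optimized
# query plan as a string, which is a quick way to confirm that the filter was
# pushed down into the Parquet scan.
@app.cell
def example_16_followup(mo):
    mo.md(
        r"""
        ```python
        plan = df_lazy.filter(pl.col("value") > 100).explain()
        print(plan)  # the predicate should appear inside the scan node
        ```
        """
    )
    return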
@app.cell
def example_17(mo):
    mo.md(
        r"""
        ### Example 17: Projection Pushdown Example
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'id' and 'text' columns to reduce the memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    )
    return


@app.cell
def example_18(mo):
    mo.md(
        r"""
        ### Example 18: Collecting a Lazy DataFrame
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_19(mo):
    mo.md(
        r"""
        ### Example 19: Filtering on a Large Text Column
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a given pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    )
    return
@app.cell
def example_20(mo):
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the length (in characters) of each value in the 'text' column
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_21(mo):
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        grouped = df_lazy.group_by("category").agg(
            pl.col("text").str.len_chars().mean().alias("avg_text_length")
        )
        print(grouped.collect())
        ```
        """
    )
    return
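# A self-contained, runnable variant of Example 21 on a small in-memory
# DataFrame (toy data, illustrative column names), so the aggregation can be
# tried without the remote dataset.
@app.cell
def example_21_demo(pl):
    toy = pl.DataFrame(
        {
            "category": ["a", "a", "b"],
            "text": ["short", "a bit longer", "medium len"],
        }
    )
    toy_grouped = toy.group_by("category").agg(
        pl.col("text").str.len_chars().mean().alias("avg_text_length")
    )
    toy_grouped
    return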
@app.cell
def example_22(mo):
    mo.md(
        r"""
        ### Example 22: Joining Two DataFrames on a Common Key
        ```python
        df1 = pl.DataFrame({"id": [1, 2, 3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1, 2, 3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_23(mo):
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins
        ```python
        from datetime import datetime

        df1 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "text": ["sample text"] * 25,
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(
                datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True
            ),
            "value": list(range(24)),
        })
        # Perform an asof join: each row of df1 is matched with the nearest
        # earlier timestamp in df2 (the default "backward" strategy)
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return
@app.cell
def example_24(mo):
    mo.md(
        r"""
        ### Example 24: Reading a Parquet File with Low Memory Option
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_25(mo):
    mo.md(
        r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_26(mo):
    mo.md(
        r"""
        ### Example 26: Reading a Large JSON File into a DataFrame
        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    )
    return


@app.cell
def example_27(mo):
    mo.md(
        r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    )
    return


@app.cell
def example_28(mo):
    mo.md(
        r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset
        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    )
    return


@app.cell
def example_29(mo):
    mo.md(
        r"""
        ### Example 29: Scanning NDJSON Files with Rechunking
        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return


@app.cell
def example_30(mo):
    mo.md(
        r"""
        ### Example 30: Scanning Parquet Files While Allowing Missing Columns
        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    )
    return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:

        - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
        - How to preview and interactively expand the DataFrame.
        - 30 examples covering various Polars I/O functions and DataFrame operations,
          which are especially useful when working with large text data.

        For more information, please refer to:

        - [Polars Documentation](https://docs.pola.rs/)
        - [Hugging Face Hub Documentation](https://huggingface.co/docs)
        - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return


if __name__ == "__main__":
    app.run()