import marimo

__generated_with = "0.10.15"
app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")
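# =============================================================================
# Imports Cell
# =============================================================================
@app.cell
def imports():
    # marimo resolves a cell's parameters from names returned by other cells,
    # not from module-level globals, so the imports live in their own cell and
    # are returned for use by the cells below.
    import os

    import marimo as mo
    import polars as pl

    return mo, os, pl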
# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
mo.md(
r"""
# Exploring a Hugging Face Dataset with Polars
In this notebook we demonstrate how to:
- **Lazy-load** a Hugging Face dataset (all Parquet files using a recursive globbing pattern).
- **Preview** the loaded DataFrame with metadata.
- **Interactively expand** the DataFrame view.
- Explore over 30 additional examples of Polars I/O functions and DataFrame manipulations—especially for handling large text data.
**Prerequisites:**
- Install dependencies via:
```bash
pip install polars marimo
```
- Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.
![Hugging Face logo](https://huggingface.co./front/assets/huggingface_logo.svg)
"""
)
return
# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo, os):
    hf_token = os.environ.get("HF_TOKEN")
    token_status = "set" if hf_token else "**not set**"
    mo.md(
        f"""
        **Hugging Face Token:** {token_status}

        *(Ensure that `HF_TOKEN` is set in your environment; the token value itself is never displayed.)*
        """
    )
    return (hf_token,)
# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(pl):
    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

    # Lazily scan the Parquet files: nothing is downloaded until the query is
    # collected. (pl.scan_parquet is the Polars-native way to defer execution;
    # marimo's mo.lazy targets lazy UI rendering rather than data loading.)
    df = pl.scan_parquet(dataset_url)

    # --- Alternative for local JSONL files (uncomment if needed):
    # df = pl.scan_ndjson("/local/path/to/*.jsonl")
    return (df,)
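# =============================================================================
# 1b. Inspect the Lazy Query Plan (optional sketch)
# =============================================================================
@app.cell
def inspect_lazy_plan(df, mo):
    # A minimal, optional sketch: before collecting anything, peek at the
    # LazyFrame's resolved schema and its query plan. Resolving the schema may
    # read metadata from the remote dataset.
    mo.vstack(
        [
            mo.md("## Lazy Query Plan"),
            mo.md(f"**Schema:** `{df.collect_schema()}`"),
            mo.md("```text\n" + df.explain() + "\n```"),
        ]
    )
    return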
# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(df, mo):
    # `df` is the LazyFrame defined in the lazy_load_dataset cell; collect only
    # the first few rows for display.
    preview = mo.ui.table(df.head(10).collect())
    mo.vstack(
        [
            mo.md(
                r"""
                ## Data Preview
                Below is a preview of the first few rows of the dataset.
                """
            ),
            preview,
        ]
    )
    return (preview,)
# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # A stateful button: its value starts as False and flips to True once it is
    # clicked. Reacting to the click happens in the next cell, because a marimo
    # UI element's value is only visible to *other* cells.
    expand_button = mo.ui.button(
        value=False,
        on_click=lambda _: True,
        label="Expand DataFrame",
    )
    mo.vstack(
        [
            mo.md(
                r"""
                ## Expand DataFrame
                Click the button below to expand the DataFrame view (rendered in the next cell).
                """
            ),
            expand_button,
        ]
    )
    return (expand_button,)
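# =============================================================================
# 3b. Render the Expanded View (reacts to the button above)
# =============================================================================
@app.cell
def expanded_table(df, expand_button, mo):
    # A hedged sketch: once the button has been clicked, collect the LazyFrame
    # and show it as a full table. Collecting pulls the whole dataset into
    # memory, so adjust (e.g. df.head(1_000).collect()) if the dataset is large.
    (
        mo.ui.table(df.collect())
        if expand_button.value
        else mo.md("_The expanded table will appear here after you click the button._")
    )
    return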
# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
mo.md(
r"""
## Column Selection Tips
**Example 1: Select specific columns by name:**
```python
selected_columns_df = df.select(["column1", "column2"])
```
**Example 2: Select all columns except column 'a':**
```python
all_except_a_df = df.select(pl.exclude("a"))
```
**Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
```python
range_columns_df = df.select(pl.col(df.columns[1:4]))
```
"""
)
return
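# =============================================================================
# Column Selection Tips: Runnable Sketch
# =============================================================================
@app.cell
def column_selection_demo(mo, pl):
    # A minimal sketch of the tips above, applied to a tiny in-memory DataFrame.
    # The column names ("a", "column1", "column2") are illustrative only and are
    # not assumed to exist in the Hugging Face dataset.
    tips_df = pl.DataFrame(
        {"a": [1, 2], "column1": ["x", "y"], "column2": [0.1, 0.2]}
    )
    selected_columns_df = tips_df.select(["column1", "column2"])       # by name
    all_except_a_df = tips_df.select(pl.exclude("a"))                  # all but "a"
    range_columns_df = tips_df.select(pl.col(tips_df.columns[1:3]))    # positional range
    mo.ui.tabs(
        {
            "Select by name": mo.ui.table(selected_columns_df),
            "Exclude 'a'": mo.ui.table(all_except_a_df),
            "Column range": mo.ui.table(range_columns_df),
        }
    )
    return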
# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================
@app.cell
def example_1(mo):
mo.md(
r"""
### Example 1: Eagerly Read a Single Parquet File
```python
df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
```
"""
)
return
@app.cell
def example_2(mo):
mo.md(
r"""
### Example 2: Read Multiple Parquet Files Using Globbing
```python
df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
```
"""
)
return
@app.cell
def example_3(mo):
mo.md(
r"""
### Example 3: Lazily Scan Parquet Files with Recursive Globbing
```python
df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
```
"""
)
return
@app.cell
def example_4(mo):
mo.md(
r"""
### Example 4: Read a JSON File into a DataFrame
```python
df_json = pl.read_json("data/sample.json")
```
"""
)
return
@app.cell
def example_5(mo):
mo.md(
r"""
### Example 5: Read JSON with a Specified Schema
```python
schema = {"name": pl.Utf8, "age": pl.Int64}
df_json = pl.read_json("data/sample.json", schema=schema)
```
"""
)
return
@app.cell
def example_6(mo):
mo.md(
r"""
### Example 6: Write a DataFrame to NDJSON Format
```python
df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
ndjson_str = df.write_ndjson()
print(ndjson_str)
```
"""
)
return
@app.cell
def example_7(mo):
mo.md(
r"""
### Example 7: Get the Schema of a Parquet File Without Reading Data
```python
schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
print(schema)
```
"""
)
return
@app.cell
def example_8(mo):
mo.md(
r"""
### Example 8: Scan Parquet Files with Hive Partitioning Enabled
```python
df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
```
"""
)
return
@app.cell
def example_9(mo):
mo.md(
r"""
### Example 9: Lazily Scan NDJSON Files Using Globbing
```python
df_lazy = pl.scan_ndjson("data/*.jsonl")
```
"""
)
return
@app.cell
def example_10(mo):
mo.md(
r"""
### Example 10: Write a DataFrame to Partitioned Parquet Files
```python
df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
df.write_parquet("output/", partition_by=["date"])
```
"""
)
return
@app.cell
def example_11(mo):
mo.md(
r"""
### Example 11: Read JSON with Custom Inference Length
```python
df = pl.read_json("data/large_text.json", infer_schema_length=500)
```
"""
)
return
@app.cell
def example_12(mo):
mo.md(
r"""
### Example 12: Read JSON with Schema Overrides
```python
schema = {"id": pl.Int64, "text": pl.Utf8}
overrides = {"id": pl.Int32}
df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
```
"""
)
return
@app.cell
def example_13(mo):
mo.md(
r"""
### Example 13: Write a DataFrame to NDJSON and Return as String
```python
df = pl.DataFrame({"foo": [1,2,3], "bar": [4,5,6]})
ndjson_output = df.write_ndjson()
print(ndjson_output)
```
"""
)
return
@app.cell
def example_14(mo):
mo.md(
r"""
### Example 14: Scan Parquet Files with Cloud Storage Options
```python
storage_options = {"token": os.environ.get("HF_TOKEN")}
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
```
"""
)
return
@app.cell
def example_15(mo):
mo.md(
r"""
### Example 15: Scan NDJSON Files with Cloud Storage Options
```python
storage_options = {"token": os.environ.get("HF_TOKEN")}
df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
```
"""
)
return
@app.cell
def example_16(mo):
mo.md(
r"""
### Example 16: Predicate Pushdown Example
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Only load rows where 'value' > 100
df_filtered = df_lazy.filter(pl.col("value") > 100)
result = df_filtered.collect()
```
"""
)
return
@app.cell
def example_17(mo):
mo.md(
r"""
### Example 17: Projection Pushdown Example
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Only select the 'text' and 'id' columns to reduce memory footprint
df_proj = df_lazy.select(["id", "text"])
result = df_proj.collect()
```
"""
)
return
@app.cell
def example_18(mo):
mo.md(
r"""
### Example 18: Collecting a Lazy DataFrame
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Perform lazy operations...
result = df_lazy.collect()
print(result)
```
"""
)
return
@app.cell
def example_19(mo):
mo.md(
r"""
### Example 19: Filtering on a Large Text Column
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Filter rows where the 'text' column contains a long string pattern
df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
print(df_filtered.head())
```
"""
)
return
@app.cell
def example_20(mo):
mo.md(
r"""
### Example 20: Using String Length on a Text Column
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
# Compute the length of text in the 'text' column
df = df.with_columns(text_length=pl.col("text").str.len_chars())
print(df.head())
```
"""
)
return
@app.cell
def example_21(mo):
mo.md(
r"""
### Example 21: Grouping by a Large Text Field
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
grouped = df.group_by("category").agg(pl.col("text").str.len_chars().mean().alias("avg_text_length"))
print(grouped)
```
"""
)
return
@app.cell
def example_22(mo):
mo.md(
r"""
### Example 22: Joining Two DataFrames on a Common Key
```python
df1 = pl.DataFrame({"id": [1,2,3], "text": ["A", "B", "C"]})
df2 = pl.DataFrame({"id": [1,2,3], "value": [100, 200, 300]})
joined = df1.join(df2, on="id")
print(joined)
```
"""
)
return
@app.cell
def example_23(mo):
mo.md(
r"""
### Example 23: Using join_asof for Time-based Joins
```python
from datetime import datetime

df1 = pl.DataFrame({
    "time": pl.datetime_range(datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True),
    "text": ["sample text"] * 25
})
df2 = pl.DataFrame({
    "time": pl.datetime_range(datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2), interval="1h", eager=True),
    "value": list(range(24))
})
# Perform an asof join to match the nearest timestamp
joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
print(joined)
```
"""
)
return
@app.cell
def example_24(mo):
mo.md(
r"""
### Example 24: Reading a Parquet File with Low Memory Option
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
print(df.head())
```
"""
)
return
@app.cell
def example_25(mo):
mo.md(
r"""
### Example 25: Scanning Parquet Files with a Parallel Strategy
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
result = df_lazy.collect()
print(result)
```
"""
)
return
@app.cell
def example_26(mo):
mo.md(
r"""
### Example 26: Reading a Large JSON File into a DataFrame
```python
df = pl.read_json("data/large_text.json", infer_schema_length=200)
print(df.head())
```
"""
)
return
@app.cell
def example_27(mo):
mo.md(
r"""
### Example 27: Using DataFrame.head() on a Large Text Dataset
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
print(df.head(10))
```
"""
)
return
@app.cell
def example_28(mo):
mo.md(
r"""
### Example 28: Using DataFrame.tail() on a Large Text Dataset
```python
df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
print(df.tail(10))
```
"""
)
return
@app.cell
def example_29(mo):
mo.md(
r"""
### Example 29: Scanning NDJSON Files with Rechunking
```python
df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
result = df_lazy.collect()
print(result)
```
"""
)
return
@app.cell
def example_30(mo):
mo.md(
r"""
### Example 30: Scanning Parquet Files with Allowing Missing Columns
```python
df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
result = df_lazy.collect()
print(result)
```
"""
)
return
# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
mo.md(
r"""
# Conclusion
This notebook showcased:
- How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
- How to preview and interactively expand the DataFrame.
- Over 30 examples covering various Polars I/O functions and DataFrame operations,
which are especially useful when working with large text data.
For more information, please refer to:
- [Polars Documentation](https://docs.pola.rs/)
- [Hugging Face Hub Documentation](https://huggingface.co./docs)
- [Marimo Notebook Documentation](https://marimo.io/)
Happy Data Exploring!
"""
)
return
if __name__ == "__main__":
app.run()