# Source: Hugging Face Space "cicero-im/marimo" (status: Sleeping; file size: 15,646 bytes)

import os
import polars as pl
import marimo

# marimo version that generated this notebook file.
__generated_with = "0.10.15"
# The notebook application; every `@app.cell` function below registers a cell on it.
app = marimo.App(app_title="Polars & Hugging Face Data Exploration", css_file="../custom.css")


# Cells obtain their inputs from names defined by other cells, so the
# libraries the cells below ask for (`mo`, `pl`) are defined by a cell here.
@app.cell
def imports():
    import marimo as mo
    import polars as pl
    return mo, pl

# =============================================================================
# Intro Cell
# =============================================================================
@app.cell
def introduction(mo):
    # Opening cell: renders the notebook title, the list of topics covered,
    # the prerequisites and the Hugging Face logo as markdown.
    # The logo URL uses the canonical host `huggingface.co` (no trailing dot).
    mo.md(
        r"""
        # Exploring a Hugging Face Dataset with Polars

        In this notebook we demonstrate how to:
         - **Lazy-load** a Hugging Face dataset (all Parquet files using a recursive globbing pattern).
         - **Preview** the loaded DataFrame with metadata.
         - **Interactively expand** the DataFrame view.
         - Explore over 30 additional examples of Polars I/O functions and DataFrame manipulations—especially for handling large text data.

        **Prerequisites:**
         - Install dependencies via:  
           ```bash
           pip install polars marimo
           ```
         - Make sure your Hugging Face API token is available in the `HF_TOKEN` environment variable.

        ![Hugging Face logo](https://huggingface.co/front/assets/huggingface_logo.svg)
        """
    )
    return

# =============================================================================
# Load HF_TOKEN from the environment
# =============================================================================
@app.cell
def load_token(mo):
    # Reports whether the HF_TOKEN environment variable is configured.
    # The token is a credential, so only its presence is rendered; the value
    # itself is neither displayed nor kept in a variable.
    import os  # local import keeps the cell self-contained

    _token_status = "configured" if os.environ.get("HF_TOKEN") else "not set"
    mo.md(f"""
    **Hugging Face Token:** `{_token_status}`  
    *(Ensure that HF_TOKEN is set in your environment.)*
    """)
    return

# =============================================================================
# 1. Lazy-load the Dataset
# =============================================================================
@app.cell
def lazy_load_dataset(mo, pl):
    # Builds a lazy query over every Parquet file of the dataset and binds it
    # to `df`. The result is a polars LazyFrame: data is only read when the
    # frame is collected.

    # Use a recursive globbing pattern to load all Parquet files from all subdirectories.
    dataset_url = "hf://datasets/cicero-im/processed_prompt1/**/*.parquet"

    def load_dataset():
        # `pl.scan_parquet` is the lazy counterpart of the eager
        # `pl.read_parquet`. `mo.lazy` is a UI wrapper that defers rendering
        # of an element, not a function decorator, so it is not applied here.
        # For local JSONL files, `pl.scan_ndjson("/local/path/to/*.jsonl")`
        # is the equivalent lazy reader.
        return pl.scan_parquet(dataset_url)

    df = load_dataset()
    return df

# =============================================================================
# 2. Preview the DataFrame with Metadata
# =============================================================================
@app.cell
def preview_data(df, mo, pl):
    # Shows the first rows of `df` (the frame defined by the loading cell)
    # underneath a short heading, and exposes the table as `preview`.
    # Underscore-prefixed names are used for temporaries so this cell does not
    # redefine `df`.
    _head = df.head()
    # A LazyFrame must be materialised before it is handed to the table;
    # an eager DataFrame is used as is.
    if isinstance(_head, pl.LazyFrame):
        _head = _head.collect()
    # `mo.ui.table` has no `metadata` argument, so none is passed.
    preview = mo.ui.table(_head)
    _heading = mo.md(
        r"""
        ## Data Preview

        Below is a preview of the first few rows along with basic metadata.
        """
    )
    # The stack is the last expression of the cell, so both the heading and
    # the table are displayed.
    mo.vstack([_heading, preview])
    return preview

# =============================================================================
# 3. Expand the DataFrame for Better Visualization
# =============================================================================
@app.cell
def expand_view(mo):
    # Displays a heading followed by the "Expand Dataframe" button and exposes
    # the button as `expand_button`. The table itself is rendered by the
    # `expanded_table` cell, which reacts to the button's value.
    expand_button = mo.ui.run_button(label="Expand Dataframe")
    _heading = mo.md(
        r"""
        ## Expand Dataframe

        Click the button below to expand the DataFrame view.
        """
    )
    # The stack is the last expression of the cell, so the button is actually
    # shown below the text.
    mo.vstack([_heading, expand_button])
    return expand_button


@app.cell
def expanded_table(df, expand_button, mo, pl):
    # Renders the full frame once the button has been clicked.
    # `mo.stop` halts this cell while the button has not been pressed.
    mo.stop(not expand_button.value)
    # A LazyFrame is collected first; an eager DataFrame is shown as is.
    _frame = df.collect() if isinstance(df, pl.LazyFrame) else df
    # `mo.ui.table` has no `width`/`height` arguments, so none are passed.
    mo.ui.table(_frame)
    return

# =============================================================================
# 4. Column Selection Tips (as Markdown)
# =============================================================================
@app.cell
def column_selection_tips(mo):
    # Static markdown cell with three column-selection snippets: by name,
    # by exclusion, and by positional slice of `df.columns`.
    _markdown = r"""
        ## Column Selection Tips

        **Example 1: Select specific columns by name:**
        ```python
        selected_columns_df = df.select(["column1", "column2"])
        ```

        **Example 2: Select all columns except column 'a':**
        ```python
        all_except_a_df = df.select(pl.exclude("a"))
        ```

        **Example 3: Select a range of columns (e.g., from the 2nd to the 4th column):**
        ```python
        range_columns_df = df.select(pl.col(df.columns[1:4]))
        ```
        """
    mo.md(_markdown)
    return

# =============================================================================
# Additional Polars I/O and DataFrame Examples (Markdown Cells)
# =============================================================================

@app.cell
def example_1(mo):
    # Static markdown cell: snippet that eagerly reads one Parquet file.
    _markdown = r"""
        ### Example 1: Eagerly Read a Single Parquet File

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_2(mo):
    # Static markdown cell: snippet that reads several Parquet files via a glob.
    _markdown = r"""
        ### Example 2: Read Multiple Parquet Files Using Globbing

        ```python
        df = pl.read_parquet("hf://datasets/roneneldan/TinyStories/data/train-*.parquet")
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_3(mo):
    # Static markdown cell: snippet that lazily scans Parquet files with a
    # recursive glob pattern.
    _markdown = r"""
        ### Example 3: Lazily Scan Parquet Files with Recursive Globbing

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/cicero-im/processed_prompt1/**/*.parquet")
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_4(mo):
    # Static markdown cell: snippet that reads a JSON file into a DataFrame.
    _markdown = r"""
        ### Example 4: Read a JSON File into a DataFrame

        ```python
        df_json = pl.read_json("data/sample.json")
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_5(mo):
    # Static markdown cell: snippet that reads JSON with an explicit schema.
    _markdown = r"""
        ### Example 5: Read JSON with a Specified Schema

        ```python
        schema = {"name": pl.Utf8, "age": pl.Int64}
        df_json = pl.read_json("data/sample.json", schema=schema)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_6(mo):
    # Static markdown cell: snippet that serialises a DataFrame to NDJSON.
    _markdown = r"""
        ### Example 6: Write a DataFrame to NDJSON Format

        ```python
        df = pl.DataFrame({"foo": [1, 2, 3], "bar": [6, 7, 8]})
        ndjson_str = df.write_ndjson()
        print(ndjson_str)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_7(mo):
    # Static markdown cell: snippet that reads only the schema of a Parquet file.
    _markdown = r"""
        ### Example 7: Get the Schema of a Parquet File Without Reading Data

        ```python
        schema = pl.read_parquet_schema("hf://datasets/roneneldan/TinyStories/data/train-00000-of-00004-2d5a1467fff1081b.parquet")
        print(schema)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_8(mo):
    # Static markdown cell: snippet that scans Parquet files with hive
    # partitioning switched on.
    _markdown = r"""
        ### Example 8: Scan Parquet Files with Hive Partitioning Enabled

        ```python
        df = pl.scan_parquet("hf://datasets/myuser/my-dataset/data/**/*.parquet", hive_partitioning=True)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_9(mo):
    # Static markdown cell: snippet that lazily scans NDJSON files via a glob.
    _markdown = r"""
        ### Example 9: Lazily Scan NDJSON Files Using Globbing

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl")
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_10(mo):
    # Static markdown cell: snippet that writes a DataFrame as Parquet files
    # partitioned by a column.
    _markdown = r"""
        ### Example 10: Write a DataFrame to Partitioned Parquet Files

        ```python
        df = pl.DataFrame({"date": ["2025-01-01", "2025-01-02"], "value": [100, 200]})
        df.write_parquet("output/", partition_by=["date"])
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_11(mo):
    # Static markdown cell: snippet that reads JSON with a custom
    # `infer_schema_length`.
    _markdown = r"""
        ### Example 11: Read JSON with Custom Inference Length

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=500)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_12(mo):
    # Static markdown cell: snippet that reads JSON with a schema plus
    # per-column overrides.
    _markdown = r"""
        ### Example 12: Read JSON with Schema Overrides

        ```python
        schema = {"id": pl.Int64, "text": pl.Utf8}
        overrides = {"id": pl.Int32}
        df = pl.read_json("data/large_text.json", schema=schema, schema_overrides=overrides)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_13(mo):
    # Static markdown cell: snippet that writes a DataFrame to NDJSON and
    # prints the returned string.
    _markdown = r"""
        ### Example 13: Write a DataFrame to NDJSON and Return as String

        ```python
        df = pl.DataFrame({"foo": [1,2,3], "bar": [4,5,6]})
        ndjson_output = df.write_ndjson()
        print(ndjson_output)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_14(mo):
    # Static markdown cell: snippet that scans Parquet files while passing
    # the HF token through `storage_options`.
    _markdown = r"""
        ### Example 14: Scan Parquet Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", storage_options=storage_options)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_15(mo):
    # Static markdown cell: snippet that scans NDJSON files while passing
    # the HF token through `storage_options`.
    _markdown = r"""
        ### Example 15: Scan NDJSON Files with Cloud Storage Options

        ```python
        storage_options = {"token": os.environ.get("HF_TOKEN")}
        df_lazy = pl.scan_ndjson("hf://datasets/myuser/my-dataset/**/*.jsonl", storage_options=storage_options)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_16(mo):
    # Static markdown cell: snippet that filters a lazy scan before collecting.
    _markdown = r"""
        ### Example 16: Predicate Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only load rows where 'value' > 100
        df_filtered = df_lazy.filter(pl.col("value") > 100)
        result = df_filtered.collect()
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_17(mo):
    # Static markdown cell: snippet that selects two columns of a lazy scan
    # before collecting.
    _markdown = r"""
        ### Example 17: Projection Pushdown Example

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Only select the 'text' and 'id' columns to reduce memory footprint
        df_proj = df_lazy.select(["id", "text"])
        result = df_proj.collect()
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_18(mo):
    # Static markdown cell: snippet that collects a lazy scan and prints it.
    _markdown = r"""
        ### Example 18: Collecting a Lazy DataFrame

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Perform lazy operations...
        result = df_lazy.collect()
        print(result)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_19(mo):
    # Static markdown cell: snippet that filters rows with `str.contains`
    # on a text column.
    _markdown = r"""
        ### Example 19: Filtering on a Large Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Filter rows where the 'text' column contains a long string pattern
        df_filtered = df.filter(pl.col("text").str.contains("important keyword"))
        print(df_filtered.head())
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_20(mo):
    # Static markdown cell: snippet that adds a column holding the character
    # length of a text column. The snippet uses `str.len_chars()`, the polars
    # expression for string length in characters (`str.len()` is not part of
    # the string namespace).
    mo.md(
        r"""
        ### Example 20: Using String Length on a Text Column

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        # Compute the length (in characters) of text in the 'text' column
        df = df.with_columns(text_length=pl.col("text").str.len_chars())
        print(df.head())
        ```
        """
    )
    return

@app.cell
def example_21(mo):
    # Static markdown cell: snippet that groups by a category column and
    # averages the character length of a text column. The frame comes from the
    # eager `pl.read_parquet`, so the aggregation result is printed directly
    # (only LazyFrames have `.collect()`), and string length is taken with
    # `str.len_chars()`.
    mo.md(
        r"""
        ### Example 21: Grouping by a Large Text Field

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        grouped = df.group_by("category").agg(pl.col("text").str.len_chars().mean().alias("avg_text_length"))
        print(grouped)
        ```
        """
    )
    return

@app.cell
def example_22(mo):
    # Static markdown cell: snippet that joins two small DataFrames on `id`.
    _markdown = r"""
        ### Example 22: Joining Two DataFrames on a Common Key

        ```python
        df1 = pl.DataFrame({"id": [1,2,3], "text": ["A", "B", "C"]})
        df2 = pl.DataFrame({"id": [1,2,3], "value": [100, 200, 300]})
        joined = df1.join(df2, on="id")
        print(joined)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_23(mo):
    # Static markdown cell: snippet demonstrating `join_asof` on hourly
    # timestamps. The snippet builds both time columns with
    # `pl.datetime_range(..., eager=True)` from `datetime` bounds, and both
    # ranges span 25 hourly steps so they match the 25-element value columns.
    mo.md(
        r"""
        ### Example 23: Using join_asof for Time-based Joins

        ```python
        from datetime import datetime

        df1 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1), datetime(2025, 1, 2), interval="1h", eager=True),
            "text": ["sample text"] * 25
        })
        df2 = pl.DataFrame({
            "time": pl.datetime_range(datetime(2025, 1, 1, 0, 30), datetime(2025, 1, 2, 0, 30), interval="1h", eager=True),
            "value": list(range(25))
        })
        # Perform an asof join: each df1 row is matched with the latest df2 row
        # at or before its timestamp (the default "backward" strategy)
        joined = df1.sort("time").join_asof(df2.sort("time"), on="time")
        print(joined)
        ```
        """
    )
    return

@app.cell
def example_24(mo):
    # Static markdown cell: snippet that reads Parquet with `low_memory=True`.
    _markdown = r"""
        ### Example 24: Reading a Parquet File with Low Memory Option

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", low_memory=True)
        print(df.head())
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_25(mo):
    # Static markdown cell: snippet that scans Parquet with `parallel="auto"`.
    _markdown = r"""
        ### Example 25: Scanning Parquet Files with a Parallel Strategy

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", parallel="auto")
        result = df_lazy.collect()
        print(result)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_26(mo):
    # Static markdown cell: snippet that reads a large JSON file and prints
    # its first rows.
    _markdown = r"""
        ### Example 26: Reading a Large JSON File into a DataFrame

        ```python
        df = pl.read_json("data/large_text.json", infer_schema_length=200)
        print(df.head())
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_27(mo):
    # Static markdown cell: snippet that prints the first 10 rows of a frame.
    _markdown = r"""
        ### Example 27: Using DataFrame.head() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.head(10))
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_28(mo):
    # Static markdown cell: snippet that prints the last 10 rows of a frame.
    _markdown = r"""
        ### Example 28: Using DataFrame.tail() on a Large Text Dataset

        ```python
        df = pl.read_parquet("hf://datasets/myuser/my-dataset/**/*.parquet")
        print(df.tail(10))
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_29(mo):
    # Static markdown cell: snippet that scans NDJSON files with `rechunk=True`.
    _markdown = r"""
        ### Example 29: Scanning NDJSON Files with Rechunking

        ```python
        df_lazy = pl.scan_ndjson("data/*.jsonl", rechunk=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    mo.md(_markdown)
    return

@app.cell
def example_30(mo):
    # Static markdown cell: snippet that scans Parquet files with
    # `allow_missing_columns=True`.
    _markdown = r"""
        ### Example 30: Scanning Parquet Files with Allowing Missing Columns

        ```python
        df_lazy = pl.scan_parquet("hf://datasets/myuser/my-dataset/**/*.parquet", allow_missing_columns=True)
        result = df_lazy.collect()
        print(result)
        ```
        """
    mo.md(_markdown)
    return

# =============================================================================
# End of Notebook
# =============================================================================
@app.cell
def conclusion(mo):
    # Closing cell: renders a recap of the notebook and links to the Polars,
    # Hugging Face Hub and marimo documentation.
    # The Hub link uses the canonical host `huggingface.co` (no trailing dot).
    mo.md(
        r"""
        # Conclusion

        This notebook showcased:
         - How to lazy-load a Hugging Face dataset using Polars with recursive globbing.
         - How to preview and interactively expand the DataFrame.
         - Over 30 examples covering various Polars I/O functions and DataFrame operations,
           which are especially useful when working with large text data.

        For more information, please refer to:
         - [Polars Documentation](https://docs.pola.rs/)
         - [Hugging Face Hub Documentation](https://huggingface.co/docs)
         - [Marimo Notebook Documentation](https://marimo.io/)

        Happy Data Exploring!
        """
    )
    return

# Run the notebook app only when this file is executed as a script,
# not when it is imported as a module.
if __name__ == "__main__":
    app.run()