Spaces:
Running
Running
Minor formatting and markdown changes
Browse files- polars/05_reactive_plots.py +45 -19
polars/05_reactive_plots.py
CHANGED
@@ -4,14 +4,14 @@
|
|
4 |
# "marimo",
|
5 |
# "numpy==2.2.3",
|
6 |
# "plotly[express]==6.0.0",
|
7 |
-
# "polars==1.
|
8 |
# "statsmodels==0.14.4",
|
9 |
# ]
|
10 |
# ///
|
11 |
|
12 |
import marimo
|
13 |
|
14 |
-
__generated_with = "0.
|
15 |
app = marimo.App(width="medium")
|
16 |
|
17 |
|
@@ -19,9 +19,16 @@ app = marimo.App(width="medium")
|
|
19 |
def _(mo):
|
20 |
mo.md(
|
21 |
"""
|
22 |
-
|
23 |
|
24 |
-
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
25 |
"""
|
26 |
)
|
27 |
return
|
@@ -47,11 +54,12 @@ def _(pl):
|
|
47 |
def _(mo):
|
48 |
mo.md(
|
49 |
"""
|
50 |
-
You should always take a look at the data you are working on before actually doing any operations on it - for data coming from sources such as HuggingFace or Kaggle you may want to look in their websites, then filter or do some transformations before downloading.
|
51 |
|
52 |
-
|
53 |
|
54 |
-
|
|
|
|
|
55 |
"""
|
56 |
)
|
57 |
return
|
@@ -82,12 +90,13 @@ def _(lz, pl):
|
|
82 |
def _(mo):
|
83 |
mo.md(
|
84 |
r"""
|
|
|
85 |
When you start exploring a dataset, some of the first things to do may include:
|
86 |
|
87 |
- investigating any values that seem weird
|
88 |
- verifying if there could be issues in the data
|
89 |
- checking for potential bugs in our pipelines
|
90 |
-
- ensuring you understand the data correctly,
|
91 |
|
92 |
For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
|
93 |
"""
|
@@ -95,7 +104,7 @@ def _(mo):
|
|
95 |
return
|
96 |
|
97 |
|
98 |
-
@app.cell(disabled=True)
|
99 |
def _(df, pl):
|
100 |
# We *could* just filter some of the rows and look at them as a table, for example...
|
101 |
pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
|
@@ -107,7 +116,7 @@ def _(df, pl):
|
|
107 |
def _(mo):
|
108 |
mo.md(
|
109 |
r"""
|
110 |
-
For this Notebook we will be using [plotly](https://plotly.com/python), but Marimo also supports
|
111 |
|
112 |
Let's visualize it using a [bar chart](https://plotly.com/python/bar-charts/) and get a feel for which region makes sense to focus on for our analysis
|
113 |
"""
|
@@ -139,9 +148,9 @@ def _(mo):
|
|
139 |
return
|
140 |
|
141 |
|
142 |
-
@app.cell
|
143 |
def _(pl, plot):
|
144 |
-
#
|
145 |
pl.DataFrame(plot.value)
|
146 |
return
|
147 |
|
@@ -176,7 +185,7 @@ def _(df, pl, plot):
|
|
176 |
def _(mo):
|
177 |
mo.md(
|
178 |
r"""
|
179 |
-
Now that our data is clean, let's start coming up with and answering some questions about it. Some examples:
|
180 |
|
181 |
- Which tracks or artists are the most popular? (Both globally as well as for each genre)
|
182 |
- Which genres are the most popular? The loudest?
|
@@ -323,7 +332,7 @@ def _(filtered_duration, mo):
|
|
323 |
searchable=True,
|
324 |
label="Filter by Track Genre:",
|
325 |
)
|
326 |
-
x_axis, y_axis, color, alpha, include_trendline, filter_genre2
|
327 |
return (
|
328 |
alpha,
|
329 |
color,
|
@@ -392,6 +401,20 @@ def _(chart2, filtered_duration, mo, pl):
|
|
392 |
return active_columns, column_order, out
|
393 |
|
394 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
395 |
@app.cell(hide_code=True)
|
396 |
def _(mo):
|
397 |
mo.md("""# Appendix : Some other examples""")
|
@@ -457,7 +480,7 @@ def _(filter_artist, filter_track, filtered_duration, mo, pl):
|
|
457 |
.sort("match_score", descending=True)
|
458 |
)
|
459 |
|
460 |
-
mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track
|
461 |
return filtered_artist_track, score_match_text
|
462 |
|
463 |
|
@@ -479,10 +502,13 @@ def _(filter_genre2, filtered_duration, mo, pl):
|
|
479 |
.len("count")
|
480 |
.collect()
|
481 |
)
|
482 |
-
(
|
483 |
-
|
484 |
-
|
485 |
-
|
|
|
|
|
|
|
486 |
)
|
487 |
return (artist_combinations,)
|
488 |
|
|
|
4 |
# "marimo",
|
5 |
# "numpy==2.2.3",
|
6 |
# "plotly[express]==6.0.0",
|
7 |
+
# "polars==1.27.1",
|
8 |
# "statsmodels==0.14.4",
|
9 |
# ]
|
10 |
# ///
|
11 |
|
12 |
import marimo
|
13 |
|
14 |
+
__generated_with = "0.12.8"
|
15 |
app = marimo.App(width="medium")
|
16 |
|
17 |
|
|
|
19 |
def _(mo):
|
20 |
mo.md(
|
21 |
"""
|
22 |
+
# Reactive Plots
|
23 |
|
24 |
+
_By [etrotta](https://github.com/etrotta)._
|
25 |
+
|
26 |
+
This tutorial covers Data Visualisation basics using marimo, [polars](https://docs.pola.rs/) and [plotly](https://plotly.com/python/plotly-express/).
|
27 |
+
It shows how to load data, explore and visualise it, then use User Interface elements (including the plots themselves) to filter and select data for more refined analysis.
|
28 |
+
|
29 |
+
We will be using a [Spotify Tracks dataset](https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset). Before you write any code yourself, I recommend taking some time to understand the data you're working with, from which columns are available to what their possible values are, as well as more abstract details such as the scope, coverage and intended uses of the dataset.
|
30 |
+
|
31 |
+
Note that this dataset does not contain data about ***all*** tracks; you can try using a larger dataset such as [bigdata-pw/Spotify](https://huggingface.co/datasets/bigdata-pw/Spotify), but I'm sticking with the smaller one to keep the notebook size manageable for most users.
|
32 |
"""
|
33 |
)
|
34 |
return
|
|
|
54 |
def _(mo):
|
55 |
mo.md(
|
56 |
"""
|
|
|
57 |
|
58 |
+
You should always take a look at the data you are working on before actually doing any operations on it - for data coming from sources such as HuggingFace or Kaggle, you can preview it via their websites, and optionally filter or do some transformations before downloading.
|
59 |
|
60 |
+
The [Polars Lazy API](https://docs.pola.rs/user-guide/lazy/) allows you to define operations before loading the data, and polars will optimize the plan in order to avoid doing unnecessary operations or loading data we do not care about.
|
61 |
+
|
62 |
+
Let's say that looking at the dataset's preview in the Data Viewer, we decided we do not want the Unnamed column (which appears to be the row index), nor do we care about the original ID, and we only want non-explicit tracks.
|
63 |
"""
|
64 |
)
|
65 |
return
|
|
|
90 |
def _(mo):
|
91 |
mo.md(
|
92 |
r"""
|
93 |
+
|
94 |
When you start exploring a dataset, some of the first things to do may include:
|
95 |
|
96 |
- investigating any values that seem weird
|
97 |
- verifying if there could be issues in the data
|
98 |
- checking for potential bugs in our pipelines
|
99 |
+
- ensuring you understand the data correctly, including its relationships and edge cases
|
100 |
|
101 |
For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
|
102 |
"""
|
|
|
104 |
return
|
105 |
|
106 |
|
107 |
+
@app.cell(disabled=True, hide_code=True)
|
108 |
def _(df, pl):
|
109 |
# We *could* just filter some of the rows and look at them as a table, for example...
|
110 |
pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
|
|
|
116 |
def _(mo):
|
117 |
mo.md(
|
118 |
r"""
|
119 |
+
For this Notebook we will be using [plotly](https://plotly.com/python), but Marimo also [supports other plotting libraries](https://docs.marimo.io/guides/working_with_data/plotting/).
|
120 |
|
121 |
Let's visualize it using a [bar chart](https://plotly.com/python/bar-charts/) and get a feel for which region makes sense to focus on for our analysis
|
122 |
"""
|
|
|
148 |
return
|
149 |
|
150 |
|
151 |
+
@app.cell(disabled=True, hide_code=True)
|
152 |
def _(pl, plot):
|
153 |
+
# If you want to see the selection itself
|
154 |
pl.DataFrame(plot.value)
|
155 |
return
|
156 |
|
|
|
185 |
def _(mo):
|
186 |
mo.md(
|
187 |
r"""
|
188 |
+
Now that our data is 'clean', let's start coming up with and answering some questions about it. Some examples:
|
189 |
|
190 |
- Which tracks or artists are the most popular? (Both globally as well as for each genre)
|
191 |
- Which genres are the most popular? The loudest?
|
|
|
332 |
searchable=True,
|
333 |
label="Filter by Track Genre:",
|
334 |
)
|
335 |
+
mo.vstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2])
|
336 |
return (
|
337 |
alpha,
|
338 |
color,
|
|
|
401 |
return active_columns, column_order, out
|
402 |
|
403 |
|
404 |
+
@app.cell
|
405 |
+
def _(mo):
|
406 |
+
mo.md(
|
407 |
+
r"""
|
408 |
+
Reviewing what we have covered in this Notebook:
|
409 |
+
|
410 |
+
- Understand the data you're working with first and foremost
|
411 |
+
- Creating plots can help you understand patterns, identify outliers and observe trends
|
412 |
+
- Thanks to marimo's interactive UI elements, we can explore multiple facets of the data without changing the code
|
413 |
+
"""
|
414 |
+
)
|
415 |
+
return
|
416 |
+
|
417 |
+
|
418 |
@app.cell(hide_code=True)
|
419 |
def _(mo):
|
420 |
mo.md("""# Appendix : Some other examples""")
|
|
|
480 |
.sort("match_score", descending=True)
|
481 |
)
|
482 |
|
483 |
+
mo.vstack([mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track])
|
484 |
return filtered_artist_track, score_match_text
|
485 |
|
486 |
|
|
|
502 |
.len("count")
|
503 |
.collect()
|
504 |
)
|
505 |
+
mo.vstack(
|
506 |
+
[
|
507 |
+
mo.md("Check which artists collaborate with others most often (reuses the last genre filter)"),
|
508 |
+
filter_genre2,
|
509 |
+
artist_combinations.sort("count", descending=True),
|
510 |
+
],
|
511 |
+
align="center",
|
512 |
)
|
513 |
return (artist_combinations,)
|
514 |
|