Spaces:
Running
Running
Split or hide some of the data transformations
Browse files- polars/05_reactive_plots.py +110 -89
polars/05_reactive_plots.py
CHANGED
@@ -104,7 +104,7 @@ def _(mo):
|
|
104 |
return
|
105 |
|
106 |
|
107 |
-
@app.cell(disabled=True
|
108 |
def _(df, pl):
|
109 |
# We *could* just filter some of the rows and look at them as a table, for example...
|
110 |
pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
|
@@ -148,27 +148,19 @@ def _(mo):
|
|
148 |
return
|
149 |
|
150 |
|
151 |
-
@app.cell
|
152 |
def _(pl, plot):
|
153 |
-
#
|
154 |
pl.DataFrame(plot.value)
|
155 |
return
|
156 |
|
157 |
|
158 |
@app.cell
|
159 |
-
def _(df, pl, plot):
|
160 |
-
|
161 |
-
|
162 |
-
|
163 |
-
|
164 |
-
min_dur, max_dur = 120, 360
|
165 |
-
else:
|
166 |
-
# We can retrieve it and use it as a filter:
|
167 |
-
min_dur, max_dur = (
|
168 |
-
min(row["duration_seconds"] for row in plot.value),
|
169 |
-
max(row["duration_seconds"] for row in plot.value),
|
170 |
-
)
|
171 |
-
|
172 |
# Calculate how many we are keeping vs throwing away with the filter
|
173 |
duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
|
174 |
print(
|
@@ -202,13 +194,15 @@ def _(mo):
|
|
202 |
return
|
203 |
|
204 |
|
205 |
-
@app.cell
|
206 |
def _(filter_genre, filtered_duration, mo, pl):
|
207 |
-
# Now, if you saw the Dataset description or looked closely at the Artists column you may notice there are some rows with multiple artists separated by ;;. We will have to separate each of these.
|
208 |
most_popular_artists = (
|
209 |
filtered_duration.lazy()
|
|
|
210 |
.with_columns(pl.col("artists").str.split(";"))
|
211 |
-
#
|
|
|
|
|
212 |
.filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
|
213 |
.explode("artists")
|
214 |
.group_by("artists")
|
@@ -228,33 +222,22 @@ def _(filter_genre, filtered_duration, mo, pl):
|
|
228 |
# And for good measure, see how many total tracks they have
|
229 |
pl.col("track_name").n_unique().alias("tracks_count"),
|
230 |
)
|
|
|
231 |
.collect()
|
232 |
)
|
233 |
-
mo.
|
234 |
-
|
235 |
-
|
236 |
-
|
237 |
-
|
238 |
-
|
239 |
-
|
240 |
-
|
241 |
-
|
242 |
-
|
243 |
-
|
244 |
-
@app.cell
|
245 |
-
def _(filtered_duration, mo):
|
246 |
-
# Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre
|
247 |
-
# While developing, you can add things out of order then go back to old cells and edit them,
|
248 |
-
# it's up to you whenever to re-order them later or keep in whichever order visually makes the most sense to you.
|
249 |
-
filter_genre = mo.ui.dropdown(
|
250 |
-
options=filtered_duration["track_genre"].unique().sort().to_list(),
|
251 |
-
allow_select_none=True,
|
252 |
-
value=None,
|
253 |
-
searchable=True,
|
254 |
-
label="Filter by Track Genre:",
|
255 |
)
|
256 |
-
|
257 |
-
return (filter_genre,)
|
258 |
|
259 |
|
260 |
@app.cell(hide_code=True)
|
@@ -301,50 +284,7 @@ def _(mo):
|
|
301 |
return
|
302 |
|
303 |
|
304 |
-
@app.cell
|
305 |
-
def _(filtered_duration, mo):
|
306 |
-
# Let's start by making some comparisons, scatter plots are a nice way to get a feel for how dependent a variable is on another
|
307 |
-
options = [
|
308 |
-
"duration_seconds",
|
309 |
-
"popularity",
|
310 |
-
"danceability",
|
311 |
-
"energy",
|
312 |
-
"key",
|
313 |
-
"loudness",
|
314 |
-
"mode",
|
315 |
-
"speechiness",
|
316 |
-
"acousticness",
|
317 |
-
"instrumentalness",
|
318 |
-
"liveness",
|
319 |
-
"valence",
|
320 |
-
"tempo",
|
321 |
-
]
|
322 |
-
x_axis = mo.ui.dropdown(options, value="energy", label="X")
|
323 |
-
y_axis = mo.ui.dropdown(options, value="danceability", label="Y")
|
324 |
-
color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
|
325 |
-
alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
|
326 |
-
include_trendline = mo.ui.checkbox(label="Trendline")
|
327 |
-
# We *could* reuse the same filter_genre as above, but it would cause marimo to rerun both the table and the graph whenever we change it
|
328 |
-
filter_genre2 = mo.ui.dropdown(
|
329 |
-
options=filtered_duration["track_genre"].unique().sort().to_list(),
|
330 |
-
allow_select_none=True,
|
331 |
-
value=None,
|
332 |
-
searchable=True,
|
333 |
-
label="Filter by Track Genre:",
|
334 |
-
)
|
335 |
-
mo.vstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2])
|
336 |
-
return (
|
337 |
-
alpha,
|
338 |
-
color,
|
339 |
-
filter_genre2,
|
340 |
-
include_trendline,
|
341 |
-
options,
|
342 |
-
x_axis,
|
343 |
-
y_axis,
|
344 |
-
)
|
345 |
-
|
346 |
-
|
347 |
-
@app.cell
|
348 |
def _(
|
349 |
alpha,
|
350 |
color,
|
@@ -367,9 +307,13 @@ def _(
|
|
367 |
opacity=alpha.value,
|
368 |
trendline="lowess" if include_trendline.value else None,
|
369 |
render_mode="webgl",
|
|
|
|
|
|
|
370 |
)
|
371 |
chart2 = mo.ui.plotly(fig2)
|
372 |
-
|
|
|
373 |
return chart2, fig2
|
374 |
|
375 |
|
@@ -401,7 +345,7 @@ def _(chart2, filtered_duration, mo, pl):
|
|
401 |
return active_columns, column_order, out
|
402 |
|
403 |
|
404 |
-
@app.cell
|
405 |
def _(mo):
|
406 |
mo.md(
|
407 |
r"""
|
@@ -415,6 +359,83 @@ def _(mo):
|
|
415 |
return
|
416 |
|
417 |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
418 |
@app.cell(hide_code=True)
|
419 |
def _(mo):
|
420 |
mo.md("""# Appendix : Some other examples""")
|
|
|
104 |
return
|
105 |
|
106 |
|
107 |
+
@app.cell(disabled=True)
|
108 |
def _(df, pl):
|
109 |
# We *could* just filter some of the rows and look at them as a table, for example...
|
110 |
pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
|
|
|
148 |
return
|
149 |
|
150 |
|
151 |
+
@app.cell
|
152 |
def _(pl, plot):
|
153 |
+
# The format of `plot.value` may vary depending on which kind of plot you are working with, let's see what we have for this case:
|
154 |
pl.DataFrame(plot.value)
|
155 |
return
|
156 |
|
157 |
|
158 |
@app.cell
|
159 |
+
def _(df, get_extremes, pl, plot):
|
160 |
+
# Now, we want to filter to only include tracks whose duration falls inside of our selection - we will need to first identify the extremes, then filter based on them
|
161 |
+
min_dur, max_dur = get_extremes(
|
162 |
+
plot.value, col="duration_seconds", defaults_if_missing=(120, 360)
|
163 |
+
) # Utility function defined at the bottom of the notebook
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
164 |
# Calculate how many we are keeping vs throwing away with the filter
|
165 |
duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
|
166 |
print(
|
|
|
194 |
return
|
195 |
|
196 |
|
197 |
+
@app.cell(hide_code=True)
|
198 |
def _(filter_genre, filtered_duration, mo, pl):
|
|
|
199 |
most_popular_artists = (
|
200 |
filtered_duration.lazy()
|
201 |
+
# If you read the dataset description or looked closely at the artists column, you may have noticed that some rows list multiple artists separated by `;`. We will have to split these into individual artists.
|
202 |
.with_columns(pl.col("artists").str.split(";"))
|
203 |
+
# Similarly to the utility function you saw before, filter_genre is also defined in a later cell.
|
204 |
+
# While developing, you can add things out of order then go back to old cells and edit them
|
205 |
+
# It's up to you to put them in whichever order makes the most sense to you.
|
206 |
.filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
|
207 |
.explode("artists")
|
208 |
.group_by("artists")
|
|
|
222 |
# And for good measure, see how many total tracks they have
|
223 |
pl.col("track_name").n_unique().alias("tracks_count"),
|
224 |
)
|
225 |
+
.sort("popularity", descending=True)
|
226 |
.collect()
|
227 |
)
|
228 |
+
mo.vstack(
|
229 |
+
[
|
230 |
+
mo.md("Let's start by taking a look at the most popular artists"),
|
231 |
+
# Also adjust the formatting for displaying columns that include multiple values in the same line
|
232 |
+
most_popular_artists.with_columns(pl.col(pl.List(pl.String())).list.join("\n")),
|
233 |
+
mo.md("Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre"),
|
234 |
+
filter_genre,
|
235 |
+
mo.md(
|
236 |
+
"(the code is omitted for brevity, but you can click the eye icon to see it)",
|
237 |
+
),
|
238 |
+
],
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
239 |
)
|
240 |
+
return (most_popular_artists,)
|
|
|
241 |
|
242 |
|
243 |
@app.cell(hide_code=True)
|
|
|
284 |
return
|
285 |
|
286 |
|
287 |
+
@app.cell(hide_code=True)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
288 |
def _(
|
289 |
alpha,
|
290 |
color,
|
|
|
307 |
opacity=alpha.value,
|
308 |
trendline="lowess" if include_trendline.value else None,
|
309 |
render_mode="webgl",
|
310 |
+
# strings on `hover` get fairly heavy when there are too many rows, but you can try using it after applying a few filters
|
311 |
+
# hover_name="track_name",
|
312 |
+
# hover_data=("artists", "album_name"),
|
313 |
)
|
314 |
chart2 = mo.ui.plotly(fig2)
|
315 |
+
|
316 |
+
mo.vstack([mo.hstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2]), chart2])
|
317 |
return chart2, fig2
|
318 |
|
319 |
|
|
|
345 |
return active_columns, column_order, out
|
346 |
|
347 |
|
348 |
+
@app.cell(hide_code=True)
|
349 |
def _(mo):
|
350 |
mo.md(
|
351 |
r"""
|
|
|
359 |
return
|
360 |
|
361 |
|
362 |
+
@app.cell(hide_code=True)
|
363 |
+
def _(mo):
|
364 |
+
mo.md(r"""# Utility Functions and UI Elements""")
|
365 |
+
return
|
366 |
+
|
367 |
+
|
368 |
+
@app.cell
|
369 |
+
def get_extremes():
|
370 |
+
def get_extremes(selection, col, defaults_if_missing):
|
371 |
+
"Get the minimum and maximum values for a given column within the selection"
|
372 |
+
if selection is None or len(selection) == 0:
|
373 |
+
print(
|
374 |
+
f"Could not find a selected region. Using default values {defaults_if_missing} instead, try clicking and dragging in the plot to change them."
|
375 |
+
)
|
376 |
+
return defaults_if_missing
|
377 |
+
else:
|
378 |
+
return (
|
379 |
+
min(row[col] for row in selection),
|
380 |
+
max(row[col] for row in selection),
|
381 |
+
)
|
382 |
+
return (get_extremes,)
|
383 |
+
|
384 |
+
|
385 |
+
@app.cell
|
386 |
+
def _(filtered_duration, mo):
|
387 |
+
filter_genre = mo.ui.dropdown(
|
388 |
+
options=filtered_duration["track_genre"].unique().sort().to_list(),
|
389 |
+
allow_select_none=True,
|
390 |
+
value=None,
|
391 |
+
searchable=True,
|
392 |
+
label="Filter by Track Genre:",
|
393 |
+
)
|
394 |
+
return (filter_genre,)
|
395 |
+
|
396 |
+
|
397 |
+
@app.cell
|
398 |
+
def _(filtered_duration, mo):
|
399 |
+
# Columns that make sense for the scatterplot and the corresponding UI elements
|
400 |
+
options = [
|
401 |
+
"duration_seconds",
|
402 |
+
"popularity",
|
403 |
+
"danceability",
|
404 |
+
"energy",
|
405 |
+
"key",
|
406 |
+
"loudness",
|
407 |
+
"mode",
|
408 |
+
"speechiness",
|
409 |
+
"acousticness",
|
410 |
+
"instrumentalness",
|
411 |
+
"liveness",
|
412 |
+
"valence",
|
413 |
+
"tempo",
|
414 |
+
]
|
415 |
+
x_axis = mo.ui.dropdown(options, value="energy", label="X")
|
416 |
+
y_axis = mo.ui.dropdown(options, value="danceability", label="Y")
|
417 |
+
color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
|
418 |
+
alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
|
419 |
+
include_trendline = mo.ui.checkbox(label="Trendline")
|
420 |
+
# We *could* reuse the same filter_genre from above, but it would cause marimo to rerun both the table and the graph whenever we change it
|
421 |
+
filter_genre2 = mo.ui.dropdown(
|
422 |
+
options=filtered_duration["track_genre"].unique().sort().to_list(),
|
423 |
+
allow_select_none=True,
|
424 |
+
value=None,
|
425 |
+
searchable=True,
|
426 |
+
label="Filter by Track Genre:",
|
427 |
+
)
|
428 |
+
return (
|
429 |
+
alpha,
|
430 |
+
color,
|
431 |
+
filter_genre2,
|
432 |
+
include_trendline,
|
433 |
+
options,
|
434 |
+
x_axis,
|
435 |
+
y_axis,
|
436 |
+
)
|
437 |
+
|
438 |
+
|
439 |
@app.cell(hide_code=True)
|
440 |
def _(mo):
|
441 |
mo.md("""# Appendix : Some other examples""")
|