etrotta committed on
Commit
3667af3
·
1 Parent(s): e5eb18d

Split or hide some of the data transformations

Browse files
Files changed (1) hide show
  1. polars/05_reactive_plots.py +110 -89
polars/05_reactive_plots.py CHANGED
@@ -104,7 +104,7 @@ def _(mo):
104
  return
105
 
106
 
107
- @app.cell(disabled=True, hide_code=True)
108
  def _(df, pl):
109
  # We *could* just filter some of the rows and look at them as a table, for example...
110
  pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
@@ -148,27 +148,19 @@ def _(mo):
148
  return
149
 
150
 
151
- @app.cell(disabled=True, hide_code=True)
152
  def _(pl, plot):
153
- # If you want to see the selection itself
154
  pl.DataFrame(plot.value)
155
  return
156
 
157
 
158
  @app.cell
159
- def _(df, pl, plot):
160
- if plot.value is None or len(plot.value) == 0:
161
- print(
162
- "Could not find a selected region. Using default values instead, try clicking and dragging in the above plot to change them."
163
- )
164
- min_dur, max_dur = 120, 360
165
- else:
166
- # We can retrieve it and use it as a filter:
167
- min_dur, max_dur = (
168
- min(row["duration_seconds"] for row in plot.value),
169
- max(row["duration_seconds"] for row in plot.value),
170
- )
171
-
172
  # Calculate how many we are keeping vs throwing away with the filter
173
  duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
174
  print(
@@ -202,13 +194,15 @@ def _(mo):
202
  return
203
 
204
 
205
- @app.cell
206
  def _(filter_genre, filtered_duration, mo, pl):
207
- # Now, if you saw the Dataset description or looked closely at the Artists column you may notice there are some rows with multiple artists separated by ;;. We will have to separate each of these.
208
  most_popular_artists = (
209
  filtered_duration.lazy()
 
210
  .with_columns(pl.col("artists").str.split(";"))
211
- # Spoiler for a future cell! Remember that in marimo you can do things 'out of order'
 
 
212
  .filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
213
  .explode("artists")
214
  .group_by("artists")
@@ -228,33 +222,22 @@ def _(filter_genre, filtered_duration, mo, pl):
228
  # And for good measure, see how many total tracks they have
229
  pl.col("track_name").n_unique().alias("tracks_count"),
230
  )
 
231
  .collect()
232
  )
233
- mo.md("Let's start with the Most popular artists")
234
- return (most_popular_artists,)
235
-
236
-
237
- @app.cell
238
- def _(most_popular_artists, pl):
239
- # Just adjust the formatting for displaying columns that include multiple values in the same line
240
- most_popular_artists.with_columns(pl.col(pl.List(pl.String())).list.join("\n")).sort("popularity", descending=True)
241
- return
242
-
243
-
244
- @app.cell
245
- def _(filtered_duration, mo):
246
- # Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre
247
- # While developing, you can add things out of order then go back to old cells and edit them,
248
- # it's up to you whenever to re-order them later or keep in whichever order visually makes the most sense to you.
249
- filter_genre = mo.ui.dropdown(
250
- options=filtered_duration["track_genre"].unique().sort().to_list(),
251
- allow_select_none=True,
252
- value=None,
253
- searchable=True,
254
- label="Filter by Track Genre:",
255
  )
256
- filter_genre
257
- return (filter_genre,)
258
 
259
 
260
  @app.cell(hide_code=True)
@@ -301,50 +284,7 @@ def _(mo):
301
  return
302
 
303
 
304
- @app.cell
305
- def _(filtered_duration, mo):
306
- # Let's start by making some comparisons, scatter plots are a nice way to get a feel for how dependent a variable is on another
307
- options = [
308
- "duration_seconds",
309
- "popularity",
310
- "danceability",
311
- "energy",
312
- "key",
313
- "loudness",
314
- "mode",
315
- "speechiness",
316
- "acousticness",
317
- "instrumentalness",
318
- "liveness",
319
- "valence",
320
- "tempo",
321
- ]
322
- x_axis = mo.ui.dropdown(options, value="energy", label="X")
323
- y_axis = mo.ui.dropdown(options, value="danceability", label="Y")
324
- color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
325
- alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
326
- include_trendline = mo.ui.checkbox(label="Trendline")
327
- # We *could* reuse the same filter_genre as above, but it would cause marimo to rerun both the table and the graph whenever we change it
328
- filter_genre2 = mo.ui.dropdown(
329
- options=filtered_duration["track_genre"].unique().sort().to_list(),
330
- allow_select_none=True,
331
- value=None,
332
- searchable=True,
333
- label="Filter by Track Genre:",
334
- )
335
- mo.vstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2])
336
- return (
337
- alpha,
338
- color,
339
- filter_genre2,
340
- include_trendline,
341
- options,
342
- x_axis,
343
- y_axis,
344
- )
345
-
346
-
347
- @app.cell
348
  def _(
349
  alpha,
350
  color,
@@ -367,9 +307,13 @@ def _(
367
  opacity=alpha.value,
368
  trendline="lowess" if include_trendline.value else None,
369
  render_mode="webgl",
 
 
 
370
  )
371
  chart2 = mo.ui.plotly(fig2)
372
- chart2
 
373
  return chart2, fig2
374
 
375
 
@@ -401,7 +345,7 @@ def _(chart2, filtered_duration, mo, pl):
401
  return active_columns, column_order, out
402
 
403
 
404
- @app.cell
405
  def _(mo):
406
  mo.md(
407
  r"""
@@ -415,6 +359,83 @@ def _(mo):
415
  return
416
 
417
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
418
  @app.cell(hide_code=True)
419
  def _(mo):
420
  mo.md("""# Appendix : Some other examples""")
 
104
  return
105
 
106
 
107
+ @app.cell(disabled=True)
108
  def _(df, pl):
109
  # We *could* just filter some of the rows and look at them as a table, for example...
110
  pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
 
148
  return
149
 
150
 
151
+ @app.cell
152
  def _(pl, plot):
153
+ # The format of `plot.value` may vary depending on which kind of plot you are working with, let's see what we have for this case:
154
  pl.DataFrame(plot.value)
155
  return
156
 
157
 
158
  @app.cell
159
+ def _(df, get_extremes, pl, plot):
160
+ # Now, we want to filter to only include tracks whose duration falls inside of our selection - we will need to first identify the extremes, then filter based on them
161
+ min_dur, max_dur = get_extremes(
162
+ plot.value, col="duration_seconds", defaults_if_missing=(120, 360)
163
+ ) # Utility function defined at the bottom of the notebook
 
 
 
 
 
 
 
 
164
  # Calculate how many we are keeping vs throwing away with the filter
165
  duration_in_range = pl.col("duration_seconds").is_between(min_dur, max_dur)
166
  print(
 
194
  return
195
 
196
 
197
+ @app.cell(hide_code=True)
198
  def _(filter_genre, filtered_duration, mo, pl):
 
199
  most_popular_artists = (
200
  filtered_duration.lazy()
201
+ # If you saw the Dataset description or looked closely at the Artists column you may notice there are some rows with multiple artists separated by `;`. We will have to separate each of these.
202
  .with_columns(pl.col("artists").str.split(";"))
203
+ # Similarly to the utility function you saw before, filter_genre is also defined in a later cell.
204
+ # While developing, you can add things out of order, then go back to old cells and edit them;
205
+ # it's up to you to put them in whichever order makes the most sense to you.
206
  .filter(True if filter_genre.value is None else pl.col("track_genre").eq(filter_genre.value))
207
  .explode("artists")
208
  .group_by("artists")
 
222
  # And for good measure, see how many total tracks they have
223
  pl.col("track_name").n_unique().alias("tracks_count"),
224
  )
225
+ .sort("popularity", descending=True)
226
  .collect()
227
  )
228
+ mo.vstack(
229
+ [
230
+ mo.md("Let's start by taking a look at the most popular artists"),
231
+ # Also adjust the formatting for displaying columns that include multiple values in the same line
232
+ most_popular_artists.with_columns(pl.col(pl.List(pl.String())).list.join("\n")),
233
+ mo.md("Recognize any of your favourite songs? Me neither. Let's try adding a filter by genre"),
234
+ filter_genre,
235
+ mo.md(
236
+ "(the code is omitted for brevity, but you can click the eye icon to see it)",
237
+ ),
238
+ ],
 
 
 
 
 
 
 
 
 
 
 
239
  )
240
+ return (most_popular_artists,)
 
241
 
242
 
243
  @app.cell(hide_code=True)
 
284
  return
285
 
286
 
287
+ @app.cell(hide_code=True)
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
288
  def _(
289
  alpha,
290
  color,
 
307
  opacity=alpha.value,
308
  trendline="lowess" if include_trendline.value else None,
309
  render_mode="webgl",
310
+ # strings on `hover` get fairly heavy when there are too many rows, but you can try using it after applying a few filters
311
+ # hover_name="track_name",
312
+ # hover_data=("artists", "album_name"),
313
  )
314
  chart2 = mo.ui.plotly(fig2)
315
+
316
+ mo.vstack([mo.hstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2]), chart2])
317
  return chart2, fig2
318
 
319
 
 
345
  return active_columns, column_order, out
346
 
347
 
348
+ @app.cell(hide_code=True)
349
  def _(mo):
350
  mo.md(
351
  r"""
 
359
  return
360
 
361
 
362
+ @app.cell(hide_code=True)
363
+ def _(mo):
364
+ mo.md(r"""# Utility Functions and UI Elements""")
365
+ return
366
+
367
+
368
+ @app.cell
369
+ def get_extremes():
370
+ def get_extremes(selection, col, defaults_if_missing):
371
+ "Get the minimum and maximum values for a given column within the selection"
372
+ if selection is None or len(selection) == 0:
373
+ print(
374
+ f"Could not find a selected region. Using default values {defaults_if_missing} instead, try clicking and dragging in the plot to change them."
375
+ )
376
+ return defaults_if_missing
377
+ else:
378
+ return (
379
+ min(row[col] for row in selection),
380
+ max(row[col] for row in selection),
381
+ )
382
+ return (get_extremes,)
383
+
384
+
385
+ @app.cell
386
+ def _(filtered_duration, mo):
387
+ filter_genre = mo.ui.dropdown(
388
+ options=filtered_duration["track_genre"].unique().sort().to_list(),
389
+ allow_select_none=True,
390
+ value=None,
391
+ searchable=True,
392
+ label="Filter by Track Genre:",
393
+ )
394
+ return (filter_genre,)
395
+
396
+
397
+ @app.cell
398
+ def _(filtered_duration, mo):
399
+ # Columns that make sense for the scatterplot and the corresponding UI elements
400
+ options = [
401
+ "duration_seconds",
402
+ "popularity",
403
+ "danceability",
404
+ "energy",
405
+ "key",
406
+ "loudness",
407
+ "mode",
408
+ "speechiness",
409
+ "acousticness",
410
+ "instrumentalness",
411
+ "liveness",
412
+ "valence",
413
+ "tempo",
414
+ ]
415
+ x_axis = mo.ui.dropdown(options, value="energy", label="X")
416
+ y_axis = mo.ui.dropdown(options, value="danceability", label="Y")
417
+ color = mo.ui.dropdown(options, value="loudness", allow_select_none=True, searchable=True, label="Color column")
418
+ alpha = mo.ui.slider(start=0.01, stop=1.0, step=0.01, value=0.1, label="Alpha", show_value=True)
419
+ include_trendline = mo.ui.checkbox(label="Trendline")
420
+ # We *could* reuse the same filter_genre from above, but it would cause marimo to rerun both the table and the graph whenever we change it
421
+ filter_genre2 = mo.ui.dropdown(
422
+ options=filtered_duration["track_genre"].unique().sort().to_list(),
423
+ allow_select_none=True,
424
+ value=None,
425
+ searchable=True,
426
+ label="Filter by Track Genre:",
427
+ )
428
+ return (
429
+ alpha,
430
+ color,
431
+ filter_genre2,
432
+ include_trendline,
433
+ options,
434
+ x_axis,
435
+ y_axis,
436
+ )
437
+
438
+
439
  @app.cell(hide_code=True)
440
  def _(mo):
441
  mo.md("""# Appendix : Some other examples""")