etrotta committed on
Commit
e5eb18d
·
1 Parent(s): 2418d37

Minor formatting and markdown changes

Browse files
Files changed (1) hide show
  1. polars/05_reactive_plots.py +45 -19
polars/05_reactive_plots.py CHANGED
@@ -4,14 +4,14 @@
4
  # "marimo",
5
  # "numpy==2.2.3",
6
  # "plotly[express]==6.0.0",
7
- # "polars==1.26.0",
8
  # "statsmodels==0.14.4",
9
  # ]
10
  # ///
11
 
12
  import marimo
13
 
14
- __generated_with = "0.11.26"
15
  app = marimo.App(width="medium")
16
 
17
 
@@ -19,9 +19,16 @@ app = marimo.App(width="medium")
19
  def _(mo):
20
  mo.md(
21
  """
22
- For this tutorial, we will be using the a [Spotify Tracks dataset](https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset).
23
 
24
- Note that it does not contains data about ***all*** tracks, you can try using a larger dataset such as [bigdata-pw/Spotify](https://huggingface.co/datasets/bigdata-pw/Spotify), but I'm sticking with the smaller one to keep the notebook size managable for most users.
 
 
 
 
 
 
 
25
  """
26
  )
27
  return
@@ -47,11 +54,12 @@ def _(pl):
47
  def _(mo):
48
  mo.md(
49
  """
50
- You should always take a look at the data you are working on before actually doing any operations on it - for data coming from sources such as HuggingFace or Kaggle you may want to look in their websites, then filter or do some transformations before downloading.
51
 
52
- The Polars Lazy engine allows for you define operations before loading the data, and polars will optimize the plan in order to avoid doing unnecessary operations
53
 
54
- Let's say that looking at it in the Data Viewer, we decided we do not want the Unnamed column (which appears to be the row index), nor do we care about the original ID, and we only want non-explicit tracks.
 
 
55
  """
56
  )
57
  return
@@ -82,12 +90,13 @@ def _(lz, pl):
82
  def _(mo):
83
  mo.md(
84
  r"""
 
85
  When you start exploring a dataset, some of the first things to do may include:
86
 
87
  - investigating any values that seem weird
88
  - verifying if there could be issues in the data
89
  - checking for potential bugs in our pipelines
90
- - ensuring you understand the data correctly, includign its relationships and edge cases
91
 
92
  For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
93
  """
@@ -95,7 +104,7 @@ def _(mo):
95
  return
96
 
97
 
98
- @app.cell(disabled=True)
99
  def _(df, pl):
100
  # We *could* just filter some of the rows and look at them as a table, for example...
101
  pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
@@ -107,7 +116,7 @@ def _(df, pl):
107
  def _(mo):
108
  mo.md(
109
  r"""
110
- For this Notebook we will be using [plotly](https://plotly.com/python), but Marimo also supports some other plotting libraries, read the documentation to learn more later.
111
 
112
  Let's visualize it using a [bar chart](https://plotly.com/python/bar-charts/) and get a feel for which region makes sense to focus on for our analysis
113
  """
@@ -139,9 +148,9 @@ def _(mo):
139
  return
140
 
141
 
142
- @app.cell
143
  def _(pl, plot):
144
- # Taking a look at the selection:
145
  pl.DataFrame(plot.value)
146
  return
147
 
@@ -176,7 +185,7 @@ def _(df, pl, plot):
176
  def _(mo):
177
  mo.md(
178
  r"""
179
- Now that our data is clean, let's start coming up with and answering some questions about it. Some examples:
180
 
181
  - Which tracks or artists are the most popular? (Both globally as well as for each genre)
182
  - Which genres are the most popular? The loudest?
@@ -323,7 +332,7 @@ def _(filtered_duration, mo):
323
  searchable=True,
324
  label="Filter by Track Genre:",
325
  )
326
- x_axis, y_axis, color, alpha, include_trendline, filter_genre2
327
  return (
328
  alpha,
329
  color,
@@ -392,6 +401,20 @@ def _(chart2, filtered_duration, mo, pl):
392
  return active_columns, column_order, out
393
 
394
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
395
  @app.cell(hide_code=True)
396
  def _(mo):
397
  mo.md("""# Appendix : Some other examples""")
@@ -457,7 +480,7 @@ def _(filter_artist, filter_track, filtered_duration, mo, pl):
457
  .sort("match_score", descending=True)
458
  )
459
 
460
- mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track
461
  return filtered_artist_track, score_match_text
462
 
463
 
@@ -479,10 +502,13 @@ def _(filter_genre2, filtered_duration, mo, pl):
479
  .len("count")
480
  .collect()
481
  )
482
- (
483
- mo.md("Check which artists collaborate with others most often (reuses the last genre filter)"),
484
- filter_genre2,
485
- artist_combinations.sort("count", descending=True),
 
 
 
486
  )
487
  return (artist_combinations,)
488
 
 
4
  # "marimo",
5
  # "numpy==2.2.3",
6
  # "plotly[express]==6.0.0",
7
+ # "polars==1.27.1",
8
  # "statsmodels==0.14.4",
9
  # ]
10
  # ///
11
 
12
  import marimo
13
 
14
+ __generated_with = "0.12.8"
15
  app = marimo.App(width="medium")
16
 
17
 
 
19
  def _(mo):
20
  mo.md(
21
  """
22
+ # Reactive Plots
23
 
24
+ _By [etrotta](https://github.com/etrotta)._
25
+
26
+ This tutorial covers Data Visualisation basics using marimo, [polars](https://docs.pola.rs/) and [plotly](https://plotly.com/python/plotly-express/).
27
+ It shows how to load data, explore and visualise it, then use User Interface elements (including the plots themselves) to filter and select data for more refined analysis.
28
+
29
+ We will be using a [Spotify Tracks dataset](https://huggingface.co/datasets/maharshipandya/spotify-tracks-dataset). Before you write any code yourself, I recommend taking some time to understand the data you're working with, from which columns are available to what are their possible values, as well as more abstract details such as the scope, coverage and intended uses of the dataset.
30
+
31
+ Note that this dataset does not contain data about ***all*** tracks. You can try using a larger dataset such as [bigdata-pw/Spotify](https://huggingface.co/datasets/bigdata-pw/Spotify), but I'm sticking with the smaller one to keep the notebook size manageable for most users.
32
  """
33
  )
34
  return
 
54
  def _(mo):
55
  mo.md(
56
  """
 
57
 
58
+ You should always take a look at the data you are working on before actually doing any operations on it - for data coming from sources such as HuggingFace or Kaggle, you can preview it via their websites, and optionally filter or do some transformations before downloading.
59
 
60
+ The [Polars Lazy API](https://docs.pola.rs/user-guide/lazy/) allows you to define operations before loading the data, and polars will optimize the plan in order to avoid doing unnecessary operations or loading data we do not care about.
61
+
62
+ Let's say that looking at the dataset's preview in the Data Viewer, we decided we do not want the Unnamed column (which appears to be the row index), nor do we care about the original ID, and we only want non-explicit tracks.
63
  """
64
  )
65
  return
 
90
  def _(mo):
91
  mo.md(
92
  r"""
93
+
94
  When you start exploring a dataset, some of the first things to do may include:
95
 
96
  - investigating any values that seem weird
97
  - verifying if there could be issues in the data
98
  - checking for potential bugs in our pipelines
99
+ - ensuring you understand the data correctly, including its relationships and edge cases
100
 
101
  For example, the "min" value for the duration column is zero, and the max is over an hour. Why is that?
102
  """
 
104
  return
105
 
106
 
107
+ @app.cell(disabled=True, hide_code=True)
108
  def _(df, pl):
109
  # We *could* just filter some of the rows and look at them as a table, for example...
110
  pl.concat([df.sort("duration_ms").head(5), df.sort("duration_ms", descending=True).head(5)])
 
116
  def _(mo):
117
  mo.md(
118
  r"""
119
+ For this Notebook we will be using [plotly](https://plotly.com/python), but Marimo also [supports other plotting libraries](https://docs.marimo.io/guides/working_with_data/plotting/).
120
 
121
  Let's visualize it using a [bar chart](https://plotly.com/python/bar-charts/) and get a feel for which region makes sense to focus on for our analysis
122
  """
 
148
  return
149
 
150
 
151
+ @app.cell(disabled=True, hide_code=True)
152
  def _(pl, plot):
153
+ # If you want to see the selection itself
154
  pl.DataFrame(plot.value)
155
  return
156
 
 
185
  def _(mo):
186
  mo.md(
187
  r"""
188
+ Now that our data is 'clean', let's start coming up with and answering some questions about it. Some examples:
189
 
190
  - Which tracks or artists are the most popular? (Both globally as well as for each genre)
191
  - Which genres are the most popular? The loudest?
 
332
  searchable=True,
333
  label="Filter by Track Genre:",
334
  )
335
+ mo.vstack([x_axis, y_axis, color, alpha, include_trendline, filter_genre2])
336
  return (
337
  alpha,
338
  color,
 
401
  return active_columns, column_order, out
402
 
403
 
404
+ @app.cell
405
+ def _(mo):
406
+ mo.md(
407
+ r"""
408
+ Reviewing what we have covered in this Notebook:
409
+
410
+ - Understand the data you're working with first and foremost
411
+ - Creating plots can help you understand patterns, identify outliers and observe trends
412
+ - Thanks to marimo's interactive UI elements, we can explore multiple facets of the data without changing the code
413
+ """
414
+ )
415
+ return
416
+
417
+
418
  @app.cell(hide_code=True)
419
  def _(mo):
420
  mo.md("""# Appendix : Some other examples""")
 
480
  .sort("match_score", descending=True)
481
  )
482
 
483
+ mo.vstack([mo.md("Filter a track based on its name or artist"), filter_artist, filter_track, filtered_artist_track])
484
  return filtered_artist_track, score_match_text
485
 
486
 
 
502
  .len("count")
503
  .collect()
504
  )
505
+ mo.vstack(
506
+ [
507
+ mo.md("Check which artists collaborate with others most often (reuses the last genre filter)"),
508
+ filter_genre2,
509
+ artist_combinations.sort("count", descending=True),
510
+ ],
511
+ align="center",
512
  )
513
  return (artist_combinations,)
514