merve HF Staff committed on
Commit
143d008
·
1 Parent(s): 76eb760

Upload app.py

Browse files
Files changed (1) hide show
  1. app.py +43 -14
app.py CHANGED
@@ -1,18 +1,29 @@
1
- from datasets import load_dataset
2
  import streamlit as st
3
  import pandas as pd
4
  import re
5
  import nltk
 
 
 
 
6
  from wordcloud import WordCloud, STOPWORDS
7
  from nltk.corpus import stopwords
8
- nltk.download("stopwords")
9
- stop = stopwords.words('english')
 
 
10
 
11
- dataset = load_dataset("huggingartists/gorillaz")
 
12
  df = pd.DataFrame.from_dict(dataset["train"])
13
 
14
- st.dataframe(df)
15
- st.write("Removed special characters")
 
 
 
 
16
 
17
  def standardize(text, remove_digits=True):
18
  text=re.sub('[^a-zA-Z\d\s]', '',text)
@@ -20,23 +31,41 @@ def standardize(text, remove_digits=True):
20
 
21
  return text
22
 
23
- df.text = df.text.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
24
- df.text=df.text.apply(standardize)
 
25
  st.dataframe(df)
26
 
27
- words = df.text.str.split(expand=True).unstack().value_counts()
28
- st.bar_chart(words[20:40])
 
 
29
  st.set_option('deprecation.showPyplotGlobalUse', False)
30
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
31
  import matplotlib.pyplot as plt
32
  def word_cloud(content, title):
33
- wc = WordCloud(background_color='white', max_words=200,
34
- stopwords=STOPWORDS, max_font_size=50)
35
  wc.generate(" ".join(content.index.values))
36
- fig = plt.figure(figsize=(16, 13))
37
  plt.title(title, fontsize=20)
38
- plt.imshow(wc.recolor(colormap='Pastel2', random_state=42), alpha=0.98)
39
  plt.axis('off')
40
  st.pyplot()
41
 
 
42
  word_cloud(words, "Word Cloud")
 
1
+
2
  import streamlit as st
3
  import pandas as pd
4
  import re
5
  import nltk
6
+ from PIL import Image
7
+ import os
8
+ import numpy as np
9
+ import seaborn as sns
10
  from wordcloud import WordCloud, STOPWORDS
11
  from nltk.corpus import stopwords
12
+ import datasets
13
+ from datasets import load_dataset
14
+ import sklearn
15
+ from sklearn.preprocessing import LabelEncoder
16
 
17
+ # loading dataset
18
+ dataset = load_dataset("merve/poetry", streaming=True)
19
  df = pd.DataFrame.from_dict(dataset["train"])
20
 
21
+
22
+ d = os.path.dirname(__file__) if "__file__" in locals() else os.getcwd()
23
+ nltk.download("stopwords")
24
+ stop = stopwords.words('english')
25
+
26
+ # standardizing dataset by removing special characters and lowercasing
27
 
28
  def standardize(text, remove_digits=True):
29
  text=re.sub('[^a-zA-Z\d\s]', '',text)
 
31
 
32
  return text
33
 
34
+ st.write("Poetry dataset, content column cleaned from special characters and lowercased")
35
+ df.content = df.content.apply(lambda x: ' '.join([word for word in x.split() if word not in (stop)]))
36
+ df.content=df.content.apply(standardize)
37
  st.dataframe(df)
38
 
39
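+ # most frequent words in the cleaned content column (NLTK stopwords were already filtered out above; WordCloud's STOPWORDS are applied only later, inside word_cloud)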
40
+ st.write("Most appearing words including stopwords")
41
+ words = df.content.str.split(expand=True).unstack().value_counts()
42
+ st.bar_chart(words[0:50])
43
  st.set_option('deprecation.showPyplotGlobalUse', False)
44
 
45
+
46
+
47
+ mask = np.array(Image.open(os.path.join(d, "poet.png")))
48
+
49
+ # distributions of poem types according to ages and authors
50
+ st.write("Distributions of poem types according to ages and authors")
51
+ le = LabelEncoder()
52
+
53
+ df.author = le.fit_transform(df.author)
54
+ sns.catplot(x="age", y="author",hue="type", data=df)
55
+ st.pyplot()
56
+
57
+ # most appearing words other than stop words
58
+
59
  import matplotlib.pyplot as plt
60
  def word_cloud(content, title):
61
+ wc = WordCloud(background_color="white", max_words=200,contour_width=3,
62
+ stopwords=STOPWORDS, mask = mask, max_font_size=50)
63
  wc.generate(" ".join(content.index.values))
64
+ fig = plt.figure(figsize=(10, 10))
65
  plt.title(title, fontsize=20)
66
+ plt.imshow(wc.recolor(colormap='magma', random_state=42), cmap=plt.cm.gray, interpolation = "bilinear", alpha=0.98)
67
  plt.axis('off')
68
  st.pyplot()
69
 
70
+ st.write("Most appearing words excluding stopwords")
71
  word_cloud(words, "Word Cloud")