abubasith86 committed on
Commit
e40e1e1
·
verified ·
1 Parent(s): 790f5b7

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +127 -96
app.py CHANGED
@@ -2,151 +2,182 @@ import streamlit as st
2
  import json
3
  import pandas as pd
4
  import os
5
- from uuid import uuid4
6
 
7
  st.set_page_config(page_title="Dataset Builder", layout="wide")
8
  st.title("📚 JSONL Dataset Editor")
9
 
10
  TMP_DIR = "temp"
11
  TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
 
 
12
  os.makedirs(TMP_DIR, exist_ok=True)
13
 
14
 
15
- # --- Helpers ---
16
  def get_all_fields(data):
17
- keys = set()
18
- for d in data:
19
- keys.update(d.keys())
20
- return sorted(list(keys))
21
-
22
-
23
- def save_to_file():
24
- with open(TMP_FILE, "w", encoding="utf-8") as f:
25
- for row in st.session_state.data:
26
- f.write(json.dumps(row, ensure_ascii=False) + "\n")
27
 
28
 
29
- # --- Session Initialization ---
30
  if "data" not in st.session_state:
31
- st.session_state.data = []
32
- if "all_fields" not in st.session_state:
33
- st.session_state.all_fields = []
34
- if "editor_key" not in st.session_state:
35
- st.session_state.editor_key = str(uuid4())
36
- if "just_added" not in st.session_state:
37
- st.session_state.just_added = False
38
-
39
- # --- Load from TMP file ---
40
- if os.path.exists(TMP_FILE) and not st.session_state.data:
41
- with open(TMP_FILE, "r", encoding="utf-8") as f:
42
- st.session_state.data = [json.loads(line) for line in f]
43
- st.session_state.all_fields = get_all_fields(st.session_state.data)
44
-
45
- # --- File Upload ---
46
- uploaded = st.file_uploader("Upload JSONL", type=["jsonl"])
47
- if uploaded:
48
- lines = uploaded.read().decode("utf-8").splitlines()
49
- st.session_state.data = [json.loads(l) for l in lines]
50
  st.session_state.all_fields = get_all_fields(st.session_state.data)
51
- save_to_file()
52
- st.session_state.editor_key = str(uuid4())
53
- st.session_state.just_added = True
54
- st.rerun() # Force rerun to update editor
55
 
56
- # --- Add New Fields fallback ---
57
- if not st.session_state.all_fields:
58
- st.session_state.all_fields = ["context", "question", "answer"]
59
-
60
- # --- Add Entry Form ---
61
- with st.form("add_form"):
62
- st.markdown("### ➕ Add New Entry")
63
- new_entry = {}
64
- for field in st.session_state.all_fields:
65
- new_entry[field] = st.text_area(field, key=f"add_{field}")
66
-
67
- submit_add = st.form_submit_button("Add Entry")
68
 
69
- if submit_add:
70
- st.session_state.data.append(new_entry)
71
- save_to_file()
72
- st.session_state.editor_key = str(uuid4())
73
- st.session_state.just_added = True
74
- st.rerun() # Force rerun to update editor
75
 
76
- # --- Wait for rerun before rendering editor ---
77
- if st.session_state.just_added:
78
- st.session_state.just_added = False
79
- st.rerun()
80
 
81
- # --- Display Editor ---
82
  st.markdown("### ✏️ Edit Records")
 
83
  df = pd.DataFrame(st.session_state.data)
84
  df = df.reindex(columns=st.session_state.all_fields)
85
 
 
86
  for field in st.session_state.all_fields:
87
- df[field] = df[field].astype(str)
 
88
 
 
89
  column_configs = {
90
- field: st.column_config.TextColumn(field, width="large")
91
- if field.lower() in ["context", "question", "answer"]
92
- else None
 
 
93
  for field in st.session_state.all_fields
94
  }
95
 
 
96
  edited_df = st.data_editor(
97
  df,
98
- key=st.session_state.editor_key,
99
  use_container_width=True,
100
  num_rows="dynamic",
101
  column_config=column_configs,
102
  )
103
 
104
- # --- Save edits immediately after editing ---
105
- if edited_df is not None:
106
- new_data = edited_df.fillna("").to_dict(orient="records")
107
- if new_data != st.session_state.data: # Ensure the data has changed before saving
108
- st.session_state.data = new_data
109
- save_to_file()
110
- st.toast("✅ Changes auto-saved!", icon="💾")
 
 
 
 
 
111
 
112
- # --- Add New Field ---
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
113
  with st.expander("➕ Add New Field"):
114
  new_field = st.text_input("New field name", key="new_field_name")
115
  if st.button("Add Field"):
116
  if new_field and new_field not in st.session_state.all_fields:
117
  st.session_state.all_fields.append(new_field)
118
- st.rerun() # Rerun after adding field
 
119
 
120
- # --- Export Dataset ---
121
- st.markdown("### 📤 Export")
122
- export_path = st.text_input("Save path", value="./exports/exported_dataset.jsonl")
123
 
124
- col1, col2, col3 = st.columns(3)
 
125
 
 
 
 
 
 
 
 
 
 
126
  with col1:
127
  if st.button("📁 Export JSONL"):
128
- os.makedirs(os.path.dirname(export_path), exist_ok=True)
129
- with open(export_path, "w", encoding="utf-8") as f:
 
 
 
130
  for row in st.session_state.data:
131
- f.write(json.dumps(row, ensure_ascii=False) + "\n")
132
- with open(export_path, "r", encoding="utf-8") as f:
133
- content = f.read()
134
- st.download_button("⬇️ Download JSONL", content, file_name=os.path.basename(export_path))
135
- st.success("✅ Exported!")
136
- st.session_state.clear()
137
- if os.path.exists(TMP_FILE):
138
- os.remove(TMP_FILE)
139
- st.rerun() # Force rerun after export
140
 
141
- with col2:
142
- if os.path.exists(TMP_FILE):
143
- with open(TMP_FILE, "r", encoding="utf-8") as f:
144
- tmp_data = f.read()
145
- st.download_button("⬇️ Temp File", tmp_data, file_name="session_dataset.jsonl")
146
 
147
- with col3:
148
- if st.button("🧹 Clear Session"):
 
 
 
 
 
 
 
 
 
 
149
  if os.path.exists(TMP_FILE):
150
  os.remove(TMP_FILE)
151
  st.session_state.clear()
152
- st.rerun() # Rerun after clearing session
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
2
  import json
3
  import pandas as pd
4
  import os
 
5
 
6
  st.set_page_config(page_title="Dataset Builder", layout="wide")
7
  st.title("📚 JSONL Dataset Editor")
8
 
9
  TMP_DIR = "temp"
10
  TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
11
+
12
+ # --- Helper: ensure tmp dir exists ---
13
  os.makedirs(TMP_DIR, exist_ok=True)
14
 
15
 
16
+ # --- Helper: get all unique fields from records ---
17
  def get_all_fields(data):
18
+ all_keys = set()
19
+ for record in data:
20
+ all_keys.update(record.keys())
21
+ return sorted(all_keys)
 
 
 
 
 
 
22
 
23
 
24
+ # --- Load session data from temp file if exists ---
25
  if "data" not in st.session_state:
26
+ if os.path.exists(TMP_FILE):
27
+ with open(TMP_FILE, "r", encoding="utf-8") as f:
28
+ st.session_state.data = [json.loads(line) for line in f]
29
+ st.session_state.all_fields = get_all_fields(st.session_state.data)
30
+ else:
31
+ st.session_state.data = []
32
+ st.session_state.all_fields = []
33
+
34
+ # --- Upload JSONL File ---
35
+ uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
36
+
37
+ if uploaded_file:
38
+ content = uploaded_file.read().decode("utf-8")
39
+ st.session_state.data = [json.loads(line) for line in content.strip().splitlines()]
 
 
 
 
 
40
  st.session_state.all_fields = get_all_fields(st.session_state.data)
 
 
 
 
41
 
42
+ # Save to temp
43
+ with open(TMP_FILE, "w", encoding="utf-8") as f:
44
+ for item in st.session_state.data:
45
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
 
 
 
 
 
 
 
 
46
 
47
+ st.success(
48
+ f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
49
+ )
 
 
 
50
 
51
+ # If still no data, use safe fallback fields
52
+ if not st.session_state.data and not st.session_state.all_fields:
53
+ st.session_state.all_fields = ["context", "question", "answer"]
 
54
 
55
+ # --- Edit Existing Records ---
56
  st.markdown("### ✏️ Edit Records")
57
+
58
  df = pd.DataFrame(st.session_state.data)
59
  df = df.reindex(columns=st.session_state.all_fields)
60
 
61
+ # Fix: Convert likely text fields to string to avoid StreamlitAPIException
62
  for field in st.session_state.all_fields:
63
+ if field.lower() in ["context", "answer", "question"]:
64
+ df[field] = df[field].astype(str)
65
 
66
+ # Auto-set long fields like "context", "answer" as textareas
67
  column_configs = {
68
+ field: (
69
+ st.column_config.TextColumn(label=field, width="large")
70
+ if field.lower() in ["context", "answer", "question"]
71
+ else None
72
+ )
73
  for field in st.session_state.all_fields
74
  }
75
 
76
+ # --- Use st.data_editor for editable table ---
77
  edited_df = st.data_editor(
78
  df,
 
79
  use_container_width=True,
80
  num_rows="dynamic",
81
  column_config=column_configs,
82
  )
83
 
84
+ # Auto-save logic: detect changes and persist
85
+ if not edited_df.equals(df):
86
+ st.session_state.data = edited_df.fillna("").to_dict(orient="records")
87
+
88
+ # Save to temp file
89
+ with open(TMP_FILE, "w", encoding="utf-8") as f:
90
+ for item in st.session_state.data:
91
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
92
+
93
+ st.toast("✅ Auto-saved!", icon="💾")
94
+ st.rerun()
95
+
96
 
97
+ # --- Add New Entry ---
98
+ st.markdown("### ➕ Add New Entry")
99
+
100
+ # Show form with current fields
101
+ with st.form("new_entry_form"):
102
+ new_record = {}
103
+ for field in st.session_state.all_fields:
104
+ new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
105
+
106
+ submitted = st.form_submit_button("Add Entry")
107
+ if submitted:
108
+ st.session_state.data.append(new_record)
109
+
110
+ # Save to temp
111
+ with open(TMP_FILE, "w", encoding="utf-8") as f:
112
+ for item in st.session_state.data:
113
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
114
+
115
+ st.success("✅ New entry added!")
116
+ st.rerun()
117
+
118
+ # Option to add a new field
119
  with st.expander("➕ Add New Field"):
120
  new_field = st.text_input("New field name", key="new_field_name")
121
  if st.button("Add Field"):
122
  if new_field and new_field not in st.session_state.all_fields:
123
  st.session_state.all_fields.append(new_field)
124
+ st.success(f"✅ Field '{new_field}' added!")
125
+ st.rerun()
126
 
 
 
 
127
 
128
+ # --- Export JSONL ---
129
+ st.markdown("### 📤 Export Dataset")
130
 
131
+ # Let user define a custom export path
132
+ export_path = st.text_input(
133
+ "Custom save path (e.g., ./exports/my_dataset.jsonl)",
134
+ value="./exports/exported_dataset.jsonl",
135
+ )
136
+
137
+ col1, col2 = st.columns(2)
138
+
139
+ # --- Export Button ---
140
  with col1:
141
  if st.button("📁 Export JSONL"):
142
+ if not os.path.exists(os.path.dirname(export_path)):
143
+ os.makedirs(os.path.dirname(export_path))
144
+
145
+ # Write to custom path
146
+ with open(export_path, "w", encoding="utf-8") as f_out:
147
  for row in st.session_state.data:
148
+ f_out.write(json.dumps(row, ensure_ascii=False) + "\n")
 
 
 
 
 
 
 
 
149
 
150
+ st.success(f"✅ Dataset saved to {export_path}")
 
 
 
 
151
 
152
+ # Load content for download
153
+ with open(export_path, "r", encoding="utf-8") as f_download:
154
+ exported_content = f_download.read()
155
+
156
+ st.download_button(
157
+ "⬇️ Download JSONL",
158
+ exported_content,
159
+ file_name=os.path.basename(export_path),
160
+ mime="application/json",
161
+ )
162
+
163
+ # Reset session and temp
164
  if os.path.exists(TMP_FILE):
165
  os.remove(TMP_FILE)
166
  st.session_state.clear()
167
+ st.success("🧹 Temporary session cleared. You're starting fresh!")
168
+ st.rerun()
169
+
170
+ # --- Download Temp Only Button ---
171
+ with col2:
172
+ if os.path.exists(TMP_FILE):
173
+ with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
174
+ tmp_content = f_tmp.read()
175
+
176
+ st.download_button(
177
+ "⬇️ Download Temp File",
178
+ tmp_content,
179
+ file_name="session_dataset.jsonl",
180
+ mime="application/json",
181
+ )
182
+ else:
183
+ st.warning("⚠️ No temp file found to download.")