abubasith86 committed on
Commit
df51149
·
verified ·
1 Parent(s): e40e1e1

Update app.py

Browse files
Files changed (1) hide show
  1. app.py +152 -111
app.py CHANGED
@@ -8,6 +8,7 @@ st.title("πŸ“š JSONL Dataset Editor")
8
 
9
  TMP_DIR = "temp"
10
  TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
 
11
 
12
  # --- Helper: ensure tmp dir exists ---
13
  os.makedirs(TMP_DIR, exist_ok=True)
@@ -21,163 +22,203 @@ def get_all_fields(data):
21
  return sorted(all_keys)
22
 
23
 
 
 
 
 
 
 
 
 
 
24
  # --- Load session data from temp file if exists ---
25
  if "data" not in st.session_state:
26
  if os.path.exists(TMP_FILE):
27
  with open(TMP_FILE, "r", encoding="utf-8") as f:
28
  st.session_state.data = [json.loads(line) for line in f]
29
- st.session_state.all_fields = get_all_fields(st.session_state.data)
30
  else:
31
  st.session_state.data = []
32
- st.session_state.all_fields = []
 
 
 
 
 
 
 
 
 
 
 
 
 
 
33
 
34
  # --- Upload JSONL File ---
35
  uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
36
 
37
  if uploaded_file:
 
38
  content = uploaded_file.read().decode("utf-8")
39
  st.session_state.data = [json.loads(line) for line in content.strip().splitlines()]
 
 
40
  st.session_state.all_fields = get_all_fields(st.session_state.data)
41
 
42
- # Save to temp
 
 
 
 
 
 
 
 
 
 
 
 
 
43
  with open(TMP_FILE, "w", encoding="utf-8") as f:
44
  for item in st.session_state.data:
45
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
46
 
 
 
 
 
 
 
 
 
 
 
 
47
  st.success(
48
  f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
49
  )
50
 
51
  # If still no data, use safe fallback fields
52
  if not st.session_state.data and not st.session_state.all_fields:
53
- st.session_state.all_fields = ["context", "question", "answer"]
54
 
55
  # --- Edit Existing Records ---
56
- st.markdown("### ✏️ Edit Records")
 
57
 
58
- df = pd.DataFrame(st.session_state.data)
59
- df = df.reindex(columns=st.session_state.all_fields)
60
 
61
- # Fix: Convert likely text fields to string to avoid StreamlitAPIException
62
- for field in st.session_state.all_fields:
63
- if field.lower() in ["context", "answer", "question"]:
64
  df[field] = df[field].astype(str)
65
 
66
- # Auto-set long fields like "context", "answer" as textareas
67
- column_configs = {
68
- field: (
69
- st.column_config.TextColumn(label=field, width="large")
70
- if field.lower() in ["context", "answer", "question"]
71
- else None
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
72
  )
73
- for field in st.session_state.all_fields
74
- }
75
-
76
- # --- Use st.data_editor for editable table ---
77
- edited_df = st.data_editor(
78
- df,
79
- use_container_width=True,
80
- num_rows="dynamic",
81
- column_config=column_configs,
82
- )
83
-
84
- # Auto-save logic: detect changes and persist
85
- if not edited_df.equals(df):
86
- st.session_state.data = edited_df.fillna("").to_dict(orient="records")
87
-
88
- # Save to temp file
89
- with open(TMP_FILE, "w", encoding="utf-8") as f:
90
- for item in st.session_state.data:
91
- f.write(json.dumps(item, ensure_ascii=False) + "\n")
92
-
93
- st.toast("βœ… Auto-saved!", icon="πŸ’Ύ")
94
- st.rerun()
95
-
96
-
97
- # --- Add New Entry ---
98
- st.markdown("### βž• Add New Entry")
99
 
100
- # Show form with current fields
101
- with st.form("new_entry_form"):
102
- new_record = {}
103
- for field in st.session_state.all_fields:
104
- new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
105
-
106
- submitted = st.form_submit_button("Add Entry")
107
- if submitted:
108
- st.session_state.data.append(new_record)
109
-
110
- # Save to temp
111
  with open(TMP_FILE, "w", encoding="utf-8") as f:
112
  for item in st.session_state.data:
113
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
114
-
115
- st.success("βœ… New entry added!")
116
  st.rerun()
117
 
118
- # Option to add a new field
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
119
  with st.expander("βž• Add New Field"):
120
- new_field = st.text_input("New field name", key="new_field_name")
 
121
  if st.button("Add Field"):
122
  if new_field and new_field not in st.session_state.all_fields:
123
  st.session_state.all_fields.append(new_field)
 
 
 
 
 
 
 
 
124
  st.success(f"βœ… Field '{new_field}' added!")
125
  st.rerun()
126
 
 
 
127
 
128
- # --- Export JSONL ---
129
- st.markdown("### πŸ“€ Export Dataset")
130
-
131
- # Let user define a custom export path
132
- export_path = st.text_input(
133
- "Custom save path (e.g., ./exports/my_dataset.jsonl)",
134
- value="./exports/exported_dataset.jsonl",
135
  )
136
 
137
- col1, col2 = st.columns(2)
138
-
139
- # --- Export Button ---
140
- with col1:
141
- if st.button("πŸ“ Export JSONL"):
142
- if not os.path.exists(os.path.dirname(export_path)):
143
- os.makedirs(os.path.dirname(export_path))
144
-
145
- # Write to custom path
146
- with open(export_path, "w", encoding="utf-8") as f_out:
147
- for row in st.session_state.data:
148
- f_out.write(json.dumps(row, ensure_ascii=False) + "\n")
149
-
150
- st.success(f"βœ… Dataset saved to {export_path}")
151
-
152
- # Load content for download
153
- with open(export_path, "r", encoding="utf-8") as f_download:
154
- exported_content = f_download.read()
155
-
156
- st.download_button(
157
- "⬇️ Download JSONL",
158
- exported_content,
159
- file_name=os.path.basename(export_path),
160
- mime="application/json",
161
- )
162
-
163
- # Reset session and temp
164
- if os.path.exists(TMP_FILE):
165
- os.remove(TMP_FILE)
166
- st.session_state.clear()
167
- st.success("🧹 Temporary session cleared. You're starting fresh!")
168
- st.rerun()
169
-
170
- # --- Download Temp Only Button ---
171
- with col2:
172
- if os.path.exists(TMP_FILE):
173
- with open(TMP_FILE, "r", encoding="utf-8") as f_tmp:
174
- tmp_content = f_tmp.read()
175
-
176
- st.download_button(
177
- "⬇️ Download Temp File",
178
- tmp_content,
179
- file_name="session_dataset.jsonl",
180
- mime="application/json",
181
- )
182
- else:
183
- st.warning("⚠️ No temp file found to download.")
 
8
 
9
  TMP_DIR = "temp"
10
  TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
11
+ META_FILE = os.path.join(TMP_DIR, "metadata.json")
12
 
13
  # --- Helper: ensure tmp dir exists ---
14
  os.makedirs(TMP_DIR, exist_ok=True)
 
22
  return sorted(all_keys)
23
 
24
 
25
+ if st.button("πŸ”„ Reset Session"):
26
+ st.session_state.clear() # Clear session state
27
+ if os.path.exists(TMP_FILE):
28
+ os.remove(TMP_FILE) # Remove temporary file
29
+ if os.path.exists(META_FILE):
30
+ os.remove(META_FILE) # Remove metadata file
31
+ st.success("🧹 Session has been reset. Starting fresh!")
32
+ st.rerun() # Rerun the app to reset everything
33
+
34
  # --- Load session data from temp file if exists ---
35
  if "data" not in st.session_state:
36
  if os.path.exists(TMP_FILE):
37
  with open(TMP_FILE, "r", encoding="utf-8") as f:
38
  st.session_state.data = [json.loads(line) for line in f]
 
39
  else:
40
  st.session_state.data = []
41
+
42
+ if os.path.exists(META_FILE):
43
+ with open(META_FILE, "r", encoding="utf-8") as f:
44
+ metadata = json.load(f)
45
+ st.session_state.all_fields = [
46
+ f["name"] for f in metadata.get("fields", [])
47
+ ]
48
+ st.session_state.field_types = {
49
+ f["name"]: f.get("type", "text") for f in metadata.get("fields", [])
50
+ }
51
+ else:
52
+ st.session_state.all_fields = get_all_fields(st.session_state.data)
53
+ st.session_state.field_types = {
54
+ field: "text" for field in st.session_state.all_fields
55
+ }
56
 
57
  # --- Upload JSONL File ---
58
  uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])
59
 
60
  if uploaded_file:
61
+ # Read the uploaded content
62
  content = uploaded_file.read().decode("utf-8")
63
  st.session_state.data = [json.loads(line) for line in content.strip().splitlines()]
64
+
65
+ # Update all fields and field types
66
  st.session_state.all_fields = get_all_fields(st.session_state.data)
67
 
68
+ # Update field_types based on metadata if it exists, otherwise set them to 'text'
69
+ if "field_types" in st.session_state:
70
+ for field in st.session_state.all_fields:
71
+ if field not in st.session_state.field_types:
72
+ st.session_state.field_types[field] = (
73
+ "textarea" # default type for new fields
74
+ )
75
+ else:
76
+ # Initialize field_types for all fields with "text" if no metadata exists
77
+ st.session_state.field_types = {
78
+ field: "text" for field in st.session_state.all_fields
79
+ }
80
+
81
+ # Save the updated data and metadata
82
  with open(TMP_FILE, "w", encoding="utf-8") as f:
83
  for item in st.session_state.data:
84
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
85
 
86
+ # Save metadata (field types)
87
+ metadata = {
88
+ "fields": [
89
+ {"name": field, "type": st.session_state.field_types.get(field, "text")}
90
+ for field in st.session_state.all_fields
91
+ ]
92
+ }
93
+ with open(META_FILE, "w", encoding="utf-8") as f:
94
+ json.dump(metadata, f, indent=2, ensure_ascii=False)
95
+
96
+ # Success message
97
  st.success(
98
  f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
99
  )
100
 
101
  # If still no data, use safe fallback fields
102
  if not st.session_state.data and not st.session_state.all_fields:
103
+ st.session_state.all_fields = []
104
 
105
  # --- Edit Existing Records ---
106
+ if st.session_state.data:
107
+ st.markdown("### ✏️ Edit Records")
108
 
109
+ df = pd.DataFrame(st.session_state.data)
110
+ df = df.reindex(columns=st.session_state.all_fields)
111
 
112
+ # Convert all columns to string for safe editing
113
+ for field in st.session_state.all_fields:
 
114
  df[field] = df[field].astype(str)
115
 
116
+ # Ensure field_types is initialized
117
+ if "field_types" not in st.session_state:
118
+ st.session_state.field_types = {
119
+ field: "text" for field in st.session_state.all_fields
120
+ }
121
+
122
+ # Build dynamic column config based on metadata
123
+ column_configs = {
124
+ field: (
125
+ st.column_config.TextColumn(label=field, width="large")
126
+ if st.session_state.field_types.get(field) == "textarea"
127
+ else st.column_config.TextColumn(label=field)
128
+ )
129
+ for field in st.session_state.all_fields
130
+ }
131
+
132
+ edited_df = st.data_editor(
133
+ df,
134
+ use_container_width=True,
135
+ num_rows="dynamic",
136
+ column_config=column_configs,
137
  )
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
138
 
139
+ if not edited_df.equals(df):
140
+ st.session_state.data = edited_df.fillna("").to_dict(orient="records")
 
 
 
 
 
 
 
 
 
141
  with open(TMP_FILE, "w", encoding="utf-8") as f:
142
  for item in st.session_state.data:
143
  f.write(json.dumps(item, ensure_ascii=False) + "\n")
144
+ st.toast("βœ… Auto-saved!", icon="πŸ’Ύ")
 
145
  st.rerun()
146
 
147
+ # --- Add New Entry ---
148
+ if st.session_state.all_fields:
149
+ st.markdown("### βž• Add New Entry")
150
+
151
+ with st.form("new_entry_form"):
152
+ new_record = {}
153
+
154
+ # Clear form fields if reset flag is set
155
+ if "reset_form" in st.session_state and st.session_state.reset_form:
156
+ # Clear the session state for each input field
157
+ for field in st.session_state.all_fields:
158
+ st.session_state[f"input_{field}"] = ""
159
+ st.session_state.reset_form = (
160
+ False # Reset the reset flag after clearing fields
161
+ )
162
+
163
+ # Collect new record input from the user
164
+ for field in st.session_state.all_fields:
165
+ input_type = st.session_state.field_types.get(field, "text")
166
+ if input_type == "textarea":
167
+ new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
168
+ else:
169
+ new_record[field] = st.text_input(f"{field}", key=f"input_{field}")
170
+
171
+ submitted = st.form_submit_button("Add Entry")
172
+
173
+ if submitted:
174
+ # Append the new record to session data
175
+ st.session_state.data.append(new_record)
176
+
177
+ # Save the updated data to a temp file
178
+ with open(TMP_FILE, "w", encoding="utf-8") as f:
179
+ for item in st.session_state.data:
180
+ f.write(json.dumps(item, ensure_ascii=False) + "\n")
181
+
182
+ # Set the reset flag to clear the form next time
183
+ st.session_state.reset_form = True
184
+
185
+ # Show success message and re-run to clear form fields
186
+ st.success("βœ… New entry added!")
187
+ st.rerun()
188
+
189
+ # --- Add New Field ---
190
  with st.expander("βž• Add New Field"):
191
+ new_field = st.text_input("Field name", key="new_field_name")
192
+ new_type = st.selectbox("Field type", ["text", "textarea"], key="new_field_type")
193
  if st.button("Add Field"):
194
  if new_field and new_field not in st.session_state.all_fields:
195
  st.session_state.all_fields.append(new_field)
196
+ st.session_state.field_types[new_field] = new_type
197
+
198
+ fields_metadata = [
199
+ {"name": f, "type": st.session_state.field_types[f]}
200
+ for f in st.session_state.all_fields
201
+ ]
202
+ with open(META_FILE, "w", encoding="utf-8") as f:
203
+ json.dump({"fields": fields_metadata}, f, indent=2, ensure_ascii=False)
204
  st.success(f"βœ… Field '{new_field}' added!")
205
  st.rerun()
206
 
207
+ # --- Download Dataset Button ---
208
+ st.markdown("### πŸ“€ Download Dataset")
209
 
210
+ # Read the session data as a JSONL string
211
+ dataset_content = "\n".join(
212
+ [json.dumps(row, ensure_ascii=False) for row in st.session_state.data]
 
 
 
 
213
  )
214
 
215
+ if os.path.exists(TMP_FILE):
216
+ # Provide the download button
217
+ st.download_button(
218
+ label="⬇️ Download Dataset as JSONL",
219
+ data=dataset_content,
220
+ file_name="session_dataset.jsonl",
221
+ mime="application/json",
222
+ )
223
+ else:
224
+ st.warning("Dataset not yet generated!")