File size: 8,102 Bytes
28572de
 
 
 
 
4f1d48a
 
28572de
 
 
df51149
e40e1e1
 
28572de
 
f3a8b9c
e40e1e1
fc8b17b
e40e1e1
 
 
 
f3a8b9c
77d363f
df51149
 
 
 
 
 
 
 
 
e40e1e1
28572de
e40e1e1
 
 
 
 
df51149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e40e1e1
 
 
 
 
df51149
e40e1e1
 
df51149
 
fc8b17b
28572de
df51149
 
 
 
 
 
 
 
 
 
 
 
 
 
e40e1e1
 
 
f3a8b9c
df51149
 
 
 
 
 
 
 
 
 
 
e40e1e1
 
 
f3a8b9c
e40e1e1
 
df51149
f3a8b9c
e40e1e1
df51149
 
e40e1e1
df51149
 
28572de
df51149
 
e40e1e1
28572de
df51149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
e40e1e1
 
df51149
 
e40e1e1
 
 
df51149
e40e1e1
 
df51149
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
28572de
df51149
 
28572de
 
 
df51149
 
 
 
 
 
 
 
e40e1e1
 
28572de
df51149
 
28572de
df51149
 
 
e40e1e1
 
df51149
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
import streamlit as st
import json
import pandas as pd
import os

# Page-level configuration and the main heading of the app.
st.set_page_config(page_title="Dataset Builder and Editor", layout="wide")
st.title("📚 JSONL Dataset Builder and Editor")

# Working files: the dataset is mirrored to TMP_FILE (one JSON object per line)
# and the field list with its input types is mirrored to META_FILE.
TMP_DIR = "temp"
TMP_FILE = os.path.join(TMP_DIR, "session_dataset.jsonl")
META_FILE = os.path.join(TMP_DIR, "metadata.json")

# --- Helper: ensure tmp dir exists ---
# exist_ok=True makes this safe to run on every script execution.
os.makedirs(TMP_DIR, exist_ok=True)


# --- Helper: get all unique fields from records ---
def get_all_fields(data):
    """Return the sorted list of every distinct key found across *data*.

    Args:
        data: An iterable of mapping-like records (each must expose ``keys()``).

    Returns:
        A list of the unique keys in ascending order; empty when *data* is
        empty or every record is empty.
    """
    # set().union() with no arguments yields an empty set, so empty input
    # needs no special case.
    unique_keys = set().union(*(entry.keys() for entry in data))
    return sorted(unique_keys)


# Show the confirmation left behind by a reset performed on the previous run.
# st.rerun() stops the current run immediately, so a message emitted right
# before it would not stay on screen; the flag carries it over instead.
if st.session_state.pop("reset_notice", False):
    st.success("🧹 Session has been reset. Starting fresh!")

if st.button("🔄 Reset Session"):
    st.session_state.clear()  # Drop all in-memory session values
    # Remove the persisted dataset and metadata; a file that is already gone
    # is not an error.
    for stale_path in (TMP_FILE, META_FILE):
        try:
            os.remove(stale_path)
        except FileNotFoundError:
            pass
    # Set after clear() so the flag survives into the next run.
    st.session_state.reset_notice = True
    st.rerun()  # Rerun the app to reset everything

# --- Load session data from temp file if exists ---
# First run of a session (or first run after a reset): restore the dataset and
# the field metadata from the temp files when they exist.
if "data" not in st.session_state:
    if os.path.exists(TMP_FILE):
        with open(TMP_FILE, "r", encoding="utf-8") as data_fh:
            # Skip blank lines so a stray empty line does not raise
            # json.JSONDecodeError.
            st.session_state.data = [
                json.loads(line) for line in data_fh if line.strip()
            ]
    else:
        st.session_state.data = []

    if os.path.exists(META_FILE):
        with open(META_FILE, "r", encoding="utf-8") as meta_fh:
            metadata = json.load(meta_fh)
        # Each metadata entry is {"name": ..., "type": ...}; a missing type
        # falls back to a single-line text input.
        field_specs = metadata.get("fields", [])
        st.session_state.all_fields = [spec["name"] for spec in field_specs]
        st.session_state.field_types = {
            spec["name"]: spec.get("type", "text") for spec in field_specs
        }
    else:
        # No metadata on disk: derive the fields from the records themselves.
        st.session_state.all_fields = get_all_fields(st.session_state.data)
        st.session_state.field_types = {
            field: "text" for field in st.session_state.all_fields
        }

# --- Upload JSONL File ---
uploaded_file = st.file_uploader("Upload a JSONL file", type=["jsonl"])

if uploaded_file is None:
    # Uploader is empty: forget the last processed upload so the same file can
    # be uploaded again later.
    st.session_state.pop("processed_upload", None)
else:
    # The uploader keeps returning the same file on every rerun. Processing it
    # each time would overwrite edits and added entries with the uploaded
    # content, so remember which upload has already been loaded.
    upload_token = (uploaded_file.name, uploaded_file.size)
    if st.session_state.get("processed_upload") != upload_token:
        try:
            # utf-8-sig also accepts files that start with a byte-order mark.
            content = uploaded_file.read().decode("utf-8-sig")
            uploaded_records = [
                json.loads(line) for line in content.splitlines() if line.strip()
            ]
            if not all(isinstance(record, dict) for record in uploaded_records):
                raise ValueError("every line must be a JSON object")
        except ValueError as exc:
            # UnicodeDecodeError and json.JSONDecodeError are ValueError
            # subclasses; report the problem instead of crashing the app and
            # leave the current dataset untouched.
            st.error(f"Could not read the uploaded file as JSONL: {exc}")
        else:
            st.session_state.processed_upload = upload_token
            st.session_state.data = uploaded_records

            # Update all fields and field types
            st.session_state.all_fields = get_all_fields(st.session_state.data)

            if "field_types" in st.session_state:
                # Keep known types; fields not seen before default to textarea.
                for field in st.session_state.all_fields:
                    if field not in st.session_state.field_types:
                        st.session_state.field_types[field] = "textarea"
            else:
                # No field types known at all: start everything as "text".
                st.session_state.field_types = {
                    field: "text" for field in st.session_state.all_fields
                }

            # Persist the dataset, one JSON object per line.
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")

            # Persist the metadata (field names and input types).
            metadata = {
                "fields": [
                    {
                        "name": field,
                        "type": st.session_state.field_types.get(field, "text"),
                    }
                    for field in st.session_state.all_fields
                ]
            }
            with open(META_FILE, "w", encoding="utf-8") as f:
                json.dump(metadata, f, indent=2, ensure_ascii=False)

            st.success(
                f"Loaded {len(st.session_state.data)} records with fields: {st.session_state.all_fields}"
            )

# If still no data, use safe fallback fields
# With neither records nor known fields, pin the field list to an empty list.
if not (st.session_state.data or st.session_state.all_fields):
    st.session_state.all_fields = []

# --- Edit Existing Records ---
if st.session_state.data:
    st.markdown("### ✏️ Edit Records")

    # One column per known field, in the configured order; fields absent from
    # a record become missing values.
    df = pd.DataFrame(st.session_state.data)
    df = df.reindex(columns=st.session_state.all_fields)

    # Convert all columns to string for safe editing. Missing values are turned
    # into empty strings first; a plain astype(str) would render them as the
    # literal text "nan"/"None", which would then be saved into the dataset.
    for field in st.session_state.all_fields:
        column = df[field].astype(object)
        df[field] = column.where(column.notna(), "").astype(str)

    # Ensure field_types is initialized
    if "field_types" not in st.session_state:
        st.session_state.field_types = {
            field: "text" for field in st.session_state.all_fields
        }

    # Every field is edited as text; "textarea" fields get a wider column.
    column_configs = {
        field: (
            st.column_config.TextColumn(label=field, width="large")
            if st.session_state.field_types.get(field) == "textarea"
            else st.column_config.TextColumn(label=field)
        )
        for field in st.session_state.all_fields
    }

    edited_df = st.data_editor(
        df,
        use_container_width=True,
        num_rows="dynamic",
        column_config=column_configs,
    )

    # Any difference from the frame that was handed to the editor is a user
    # edit: store it, write it to the temp file, and rerun so the editor is
    # rebuilt from the saved records.
    if not edited_df.equals(df):
        # Cells of newly added rows may be empty; store them as "".
        st.session_state.data = edited_df.fillna("").to_dict(orient="records")
        with open(TMP_FILE, "w", encoding="utf-8") as f:
            for item in st.session_state.data:
                f.write(json.dumps(item, ensure_ascii=False) + "\n")
        st.toast("✅ Auto-saved!", icon="💾")
        st.rerun()

# --- Add New Entry ---
if st.session_state.all_fields:
    st.markdown("### ➕ Add New Entry")

    # reset_form is set by a successful submit on the previous run. Handle it
    # before the inputs are created: blank their stored values and show the
    # confirmation here, because the st.rerun() that follows a submit stops
    # that run before a message could stay on screen.
    if st.session_state.get("reset_form"):
        for field in st.session_state.all_fields:
            st.session_state[f"input_{field}"] = ""
        st.session_state.reset_form = False
        st.success("✅ New entry added!")

    with st.form("new_entry_form"):
        new_record = {}

        # One input per field; "textarea" fields get a multi-line box.
        for field in st.session_state.all_fields:
            input_type = st.session_state.field_types.get(field, "text")
            if input_type == "textarea":
                new_record[field] = st.text_area(f"{field}", key=f"input_{field}")
            else:
                new_record[field] = st.text_input(f"{field}", key=f"input_{field}")

        submitted = st.form_submit_button("Add Entry")

        if submitted:
            # Append the new record to session data
            st.session_state.data.append(new_record)

            # Rewrite the temp file with the full dataset, one object per line.
            with open(TMP_FILE, "w", encoding="utf-8") as f:
                for item in st.session_state.data:
                    f.write(json.dumps(item, ensure_ascii=False) + "\n")

            # Ask the next run to clear the inputs and confirm the addition,
            # then rerun so the rest of the page picks up the new record.
            st.session_state.reset_form = True
            st.rerun()

# --- Add New Field ---
# Confirmation for a field added on the previous run. It is shown outside the
# expander and after the rerun, since st.rerun() stops the run that added the
# field before a message could stay on screen.
added_field_name = st.session_state.pop("field_added_notice", None)
if added_field_name:
    st.success(f"✅ Field '{added_field_name}' added!")

with st.expander("➕ Add New Field"):
    new_field = st.text_input("Field name", key="new_field_name")
    new_type = st.selectbox("Field type", ["text", "textarea"], key="new_field_type")
    if st.button("Add Field"):
        # Ignore surrounding whitespace so " name " and "name" are one field.
        field_name = new_field.strip()
        if not field_name:
            st.warning("Please enter a field name.")
        elif field_name in st.session_state.all_fields:
            st.warning(f"Field '{field_name}' already exists.")
        else:
            st.session_state.all_fields.append(field_name)
            st.session_state.field_types[field_name] = new_type

            # Persist the full field list; a field without a recorded type
            # falls back to "text".
            fields_metadata = [
                {"name": f, "type": st.session_state.field_types.get(f, "text")}
                for f in st.session_state.all_fields
            ]
            with open(META_FILE, "w", encoding="utf-8") as f:
                json.dump({"fields": fields_metadata}, f, indent=2, ensure_ascii=False)
            st.session_state.field_added_notice = field_name
            st.rerun()

# --- Download Dataset Button ---
st.markdown("### 📀 Download Dataset")

# Serialize the session data as JSONL: one JSON object per line, each line
# newline-terminated, matching how the temp file is written.
dataset_content = "".join(
    json.dumps(row, ensure_ascii=False) + "\n" for row in st.session_state.data
)

# The payload comes from the session data, so offer the download only when
# there are records to export.
if st.session_state.data:
    st.download_button(
        label="⬇️ Download Dataset as JSONL",
        data=dataset_content,
        file_name="session_dataset.jsonl",
        mime="application/json",
    )
else:
    st.warning("Dataset not yet generated!")