File size: 11,068 Bytes
4eb9d09
 
da88570
4eb9d09
 
 
da88570
 
 
 
 
 
4eb9d09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da88570
 
 
 
 
 
 
 
 
 
 
 
 
4eb9d09
da88570
 
 
4eb9d09
da88570
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
4eb9d09
da88570
 
 
 
 
 
 
4eb9d09
 
da88570
 
 
 
 
 
 
 
 
 
 
 
 
4eb9d09
 
da88570
 
4eb9d09
da88570
 
 
 
 
 
 
 
 
 
4eb9d09
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
da88570
 
4eb9d09
 
 
 
 
 
 
 
da88570
 
 
 
4eb9d09
 
 
 
 
 
 
 
da88570
 
 
4eb9d09
 
 
 
 
 
 
 
da88570
 
4eb9d09
da88570
 
 
 
 
 
 
 
 
 
 
4eb9d09
da88570
 
 
 
 
 
 
 
 
 
1
2
3
4
5
6
7
8
9
10
11
12
13
14
15
16
17
18
19
20
21
22
23
24
25
26
27
28
29
30
31
32
33
34
35
36
37
38
39
40
41
42
43
44
45
46
47
48
49
50
51
52
53
54
55
56
57
58
59
60
61
62
63
64
65
66
67
68
69
70
71
72
73
74
75
76
77
78
79
80
81
82
83
84
85
86
87
88
89
90
91
92
93
94
95
96
97
98
99
100
101
102
103
104
105
106
107
108
109
110
111
112
113
114
115
116
117
118
119
120
121
122
123
124
125
126
127
128
129
130
131
132
133
134
135
136
137
138
139
140
141
142
143
144
145
146
147
148
149
150
151
152
153
154
155
156
157
158
159
160
161
162
163
164
165
166
167
168
169
170
171
172
173
174
175
176
177
178
179
180
181
182
183
184
185
186
187
188
189
190
191
192
193
194
195
196
197
198
199
200
201
202
203
204
205
206
207
208
209
210
211
212
213
214
215
216
217
218
219
220
221
222
223
224
225
226
227
228
229
230
231
232
233
234
235
236
237
238
239
240
241
242
243
244
245
246
247
248
249
250
251
252
253
254
255
256
257
258
259
260
261
262
263
264
265
266
267
268
269
import datetime

import pandas as pd
from lxml.html.defs import table_tags
from lxml.html.diff import end_tag

from src.utils.helpers import clean_html
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.utils.helpers import normalize_data
from src.persistence.db import *
from src.utils.apis.gpt_api import remove_boilerplate

# DB event information schema:
# information: {
#     actual: {
#         title:
#         dates: [
#             {
#                 StartDate:
#                 EndDate:
#                 StartTime:
#                 EndTime:
#                 AdmittanceTime
#             }, ...
#         ],
#         location
#         address: {
#             street,
#             housenumber,
#             postalcode,
#             city
#         }
#         organizers
#         prices
#         entryFree
#         categories
#         description
#
#     }
#     predicted: same structure as actual
# }


@st.cache_resource
def init_connection():
    """Return the database handle produced by ``init_db()``.

    Wrapped in ``st.cache_resource`` so the handle is created through
    Streamlit's resource cache rather than on every script run.
    """
    connection = init_db()
    return connection

def remove_url():
    """Delete the currently displayed element and refresh the element list.

    Used as the ``on_click`` callback of the "URL löschen" button. Reads the
    module-level ``current_element`` and ``db``.
    """
    event_filter = {"final": True, "class": "EventDetail"}
    event_fields = {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1}

    db.event_urls.delete_one({"_id": current_element["_id"]})
    st.session_state.elements = db.event_urls.find(event_filter, event_fields)

    # The cached date ranges belong to the element that was just deleted;
    # drop them so they are not shown for (and saved onto) the element that
    # now occupies this index.
    st.session_state.pop("time_ranges", None)

    # Deleting the last element would leave the index pointing past the end
    # of the result set, so clamp it to the last remaining position.
    # NOTE(review): assumes db.event_urls is a PyMongo-style collection that
    # offers count_documents() — confirm against init_db().
    remaining = db.event_urls.count_documents(event_filter)
    st.session_state.index = max(0, min(st.session_state.index, remaining - 1))

def next():
    """Save the annotations of the current element and move to the next one.

    Used as the ``on_click`` callback of the "Speichern und Weiter" button.
    Reads the module-level ``current_element``, ``event_information`` and
    ``db``. The name shadows the builtin ``next`` but is kept because the
    button refers to it.
    """
    # event_information is assigned inside the page's try block; if rendering
    # failed before that point the name may be undefined. Look it up
    # defensively so the user can still skip a broken page instead of
    # hitting a NameError in the callback.
    information = globals().get("event_information")
    if information is not None:
        db.event_urls.update_one(
            {"_id": current_element["_id"]},
            {"$set": {"information": information}},
        )
    st.session_state.index += 1
    # Drop the cached date ranges so the next element loads its own; the
    # default avoids a KeyError when the key is already gone.
    st.session_state.pop("time_ranges", None)

def prev():
    """Step back to the previous element without saving anything.

    Used as the ``on_click`` callback of the "Zurück" button. Also discards
    the cached ``time_ranges`` so they are re-read for the element shown next.
    """
    state = st.session_state
    state.index = state.index - 1
    state.pop("time_ranges")

# Session bootstrap: database handle plus per-session defaults.
db = init_connection()
if "index" not in st.session_state:
    st.session_state.index = 0
if "elements" not in st.session_state:
    detail_filter = {"final": True, "class": "EventDetail"}
    detail_fields = {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1}
    candidates = db.event_urls.find(detail_filter, detail_fields)

    # Preprocess the HTML content: every element without a "data" field gets
    # its cleaned markdown computed and written back to the database.
    for doc in candidates:
        if "data" in doc:
            continue
        print(doc["url"])
        raw_markdown = convert_html_to_md(clean_html(doc["html"]))
        try:
            st.info("GPT-API Anfrage läuft")
            cleaned_markdown = remove_boilerplate(raw_markdown)
            st.info("Verarbeitung beendet")
            doc["data"] = cleaned_markdown
            db.event_urls.update_one({"_id": doc["_id"]}, {"$set": {"data": doc["data"]}})
        except Exception as e:
            # Elements that cannot be processed are removed from the database.
            st.error(f"Es ist ein Fehler aufgetreten: {e} \n")
            db.event_urls.delete_one({"_id": doc["_id"]})
    # Query again so the session holds a fresh result that reflects the
    # updates and deletions made above.
    st.session_state.elements = db.event_urls.find(detail_filter, detail_fields)
if "predictions_on" not in st.session_state:
    st.session_state.predictions_on = False



# Resolve the element to annotate. Indexing past the end of the result set
# raises IndexError (e.g. after saving the last element, or when no matching
# documents exist), which previously crashed the page and made the
# "no data" branch further down unreachable. Step back to the last valid
# position instead, and fall back to None when nothing is available.
# NOTE(review): assumes st.session_state.elements is a PyMongo-style cursor or
# another sequence that raises IndexError for an invalid position — confirm.
while True:
    try:
        current_element = st.session_state.elements[st.session_state.index]
        break
    except IndexError:
        if st.session_state.index <= 0:
            current_element = None
            break
        st.session_state.index -= 1

# Mirror the toggle into session state so the rest of the page can read it.
predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
st.session_state.predictions_on = predictions_on

# Main annotation page: shows the cleaned page content of the current element,
# collects the ground-truth event information and offers navigation buttons.
if current_element:
    # Seed the editable date ranges from the stored annotation once per
    # element; the navigation callbacks remove the key again.
    if "time_ranges" not in st.session_state:
        st.session_state.time_ranges = current_element.get("information", {}).get("actual", {}).get("dates", [])
    current_url = current_element['url']

    try:
        st.write(f"""### Aktuelle Seite: \n{current_url} """)
        # Elements without cleaned markdown are processed on demand.
        if "data" not in current_element:
            md = convert_html_to_md(clean_html(current_element["html"]))
            try:
                gpt_md = remove_boilerplate(md)
                current_element["data"] = gpt_md
                db.event_urls.update_one({"_id": current_element["_id"]}, { "$set": { 'data': current_element["data"] } })
            except Exception as e:
                st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
                db.event_urls.delete_one({"_id": current_element["_id"]})
        table_data = current_element["data"]
        normalized_text = normalize_data(table_data)
        predicted_title = None
        predicted_date = None
        predicted_organizers = None
        if st.session_state.predictions_on:
            predicted_title = TitleExtractor().extract_title(normalized_text)
            # Date prediction is currently disabled:
            # predicted_date = extract_entities(normalized_text, ["date", "date_range"])
            # predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None
        with st.container(border=True, height=400):
            st.markdown(normalized_text)
        with st.expander("Code ansehen"):
            with st.container( height=400):
                st.code(normalized_text)

        # Previously saved ground-truth values, used to pre-fill the inputs.
        stored_actual = current_element.get("information", {}).get("actual", {})
        # Widget keys are suffixed with the element id so every element gets
        # its own widget state.
        element_key = str(current_element["_id"])

        with st.expander("Veranstaltungsinformationen eingeben..."):
            actual_title = st.text_input("Tatsächlicher Titel eingeben:", key="title" + element_key,
                                         value=stored_actual.get("title", None))

            # Form for adding a new date/time range.
            with st.form("time_form"):
                col1, col2 = st.columns(2)  # first pair: start and end date
                with col1:
                    start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
                with col2:
                    end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))

                col3, col4 = st.columns(2)  # second pair: start and end time
                with col3:
                    start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
                with col4:
                    end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))

                time_submitted = st.form_submit_button("Hinzufügen")

            # When the form was submitted, append the new range.
            if time_submitted:
                # Dates and times are stored as full datetime objects: dates at
                # midnight, times combined with today's date.
                # NOTE(review): presumably because the database driver cannot
                # store bare date/time values — confirm.
                new_entry = {
                    "start_date": datetime.datetime.combine(start_date,datetime.time(0)) if start_date else None,
                    "end_date": datetime.datetime.combine(end_date,datetime.time(0)) if end_date else None,
                    "start_time": datetime.datetime.combine(datetime.date.today(),start_time) if start_time else None,
                    "end_time": datetime.datetime.combine(datetime.date.today(),end_time) if end_time else None,
                }
                st.session_state.time_ranges.append(new_entry)
                st.success("Zeitraum hinzugefügt!")
            input_dates = st.session_state.time_ranges
            # One line per range: "<start date> - <end date> <start time> - <end time>",
            # with missing parts left out.
            actual_dates = "\n\n".join([
                " ".join(filter(None, [  # drops empty strings
                    entry.get('start_date').strftime("%Y-%m-%d") if entry.get('start_date') else '',
                    f"- {entry.get('end_date').strftime('%Y-%m-%d')}" if entry.get('end_date') else '',
                    entry.get('start_time').strftime("%H:%M") if entry.get('start_time') else '',
                    f"- {entry.get('end_time').strftime('%H:%M')}" if entry.get('end_time') else ''
                ]))
                for entry in input_dates
            ])

            input_organizers = st.text_input("Tatsächlicher Veranstalter eingeben:", key="organizer" + element_key,
                                         value=",".join(stored_actual.get("organizers", [])))
            # Skip blank entries: "".split(",") yields [""], which used to be
            # saved as a single empty organizer.
            actual_organizers = [name.strip() for name in input_organizers.split(",") if name.strip()]

            actual_location = st.text_input("Location Name", key="location" + element_key,
                                         value=stored_actual.get("location", None))
            with st.form("address_form"):
                st.write("Adresse eingeben")

                col1, col2 = st.columns([3, 1])  # columns for street & house number
                street = col1.text_input("Straße")
                house_number = col2.text_input("Hausnummer")

                col3, col4 = st.columns([1, 3])  # columns for postal code & city
                postal_code = col3.text_input("Postleitzahl")
                city = col4.text_input("Stadt")

                address_submitted = st.form_submit_button("Speichern")
            # Keep the stored address unless the form was submitted in this run.
            address = stored_actual.get("address", None)
            if address_submitted:
                address= {
                    "street": street,
                    "house_number": house_number,
                    "postal_code": postal_code,
                    "city": city,
                }

            actual_prices = st.text_input(
                "Preise",
                key="price" + element_key,
                value= ";".join(stored_actual.get("prices", [])))

        # Prices are entered as one ";"-separated string.
        price_list = actual_prices.split(";") if actual_prices else []

        # Payload written to the database by the "Speichern und Weiter" callback.
        event_information = {
            "actual":
                {
                    "title":actual_title,
                    "dates":st.session_state.time_ranges,
                    "organizers":actual_organizers,
                    "location": actual_location,
                    "address":address,
                    "prices":price_list,
                }
        }
        # Side-by-side overview of entered and predicted values.
        table_data = {
            "Information": [
                "Titel",
                "Daten",
                "Veranstalter",
                "Location",
                "Straße",
                "Hausnummer",
                "Postleitzahl",
                "Stadt",
                "Preise"
            ],
            "Tatsächlicher Wert":
                [
                    actual_title,
                    actual_dates,
                    "\n\n".join(actual_organizers),
                    actual_location if actual_location else "",
                    address.get("street") if address else "",
                    address.get("house_number") if address else "",
                    address.get("postal_code") if address else "",
                    address.get("city") if address else "",
                    # Joined like dates and organizers so every cell is a string.
                    "\n\n".join(price_list),
                ],
            "Predicted Wert": [
                predicted_title,
                predicted_date,
                predicted_organizers,
                "",
                "",
                "",
                "",
                "",
                ""
            ],
        }
        df = pd.DataFrame(table_data)

        st.subheader("Vergleich der Titel:")
        st.table(df)

    except Exception as e:
        # Best-effort page: show the error and the URL, keep the navigation usable.
        st.write(f"Fehler: {e}")
        st.write(current_url)
    col1, col2, col3, col4= st.columns([1, 1, 1, 1])


    with col1:
        st.button("Zurück", on_click=prev, disabled=st.session_state.index<1)
    with col3:
        st.button("URL löschen", on_click=remove_url)
    with col4:
        st.button("Speichern und Weiter",on_click=next)

else:
    st.write("Es sind aktuell keine Daten in der DB zur Bearbeitung vorhanden.")