"""Streamlit page for manually annotating event-detail pages stored in MongoDB.

For every document in ``db.event_urls`` matching ``final == True`` and
``class == "EventDetail"`` the page shows the cleaned markdown of the page
and lets the user enter the actual event information (title, dates,
organizers, location, address, prices), which is written back to the
document's ``information`` field.
"""
import datetime
import pandas as pd
# `st` was previously only reachable through the wildcard import below;
# importing it explicitly keeps the script working if that module changes.
import streamlit as st
from lxml.html.defs import table_tags
from lxml.html.diff import end_tag
from src.utils.helpers import clean_html
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.utils.helpers import normalize_data
from src.persistence.db import *
from src.utils.apis.gpt_api import remove_boilerplate

# Shape of the `information` field as written by this script:
# information: {
#     actual: {
#         title: str,
#         dates: [
#             {
#                 start_date: datetime | None,   # date at midnight
#                 end_date:   datetime | None,   # date at midnight
#                 start_time: datetime | None,   # time combined with today's date
#                 end_time:   datetime | None,   # time combined with today's date
#             }, ...
#         ],
#         organizers: [str],
#         location: str,
#         address: {street, house_number, postal_code, city} | None,
#         prices: [str],
#     }
# }
# The originally planned schema additionally listed an admittance time per
# date entry, `entryFree`, `categories`, `description` and a `predicted`
# object shaped like `actual`; none of those are written by this script.

# Query and projection shared by every lookup of the documents to annotate.
_EVENT_DETAIL_FILTER = {"final": True, "class": "EventDetail"}
_EVENT_DETAIL_PROJECTION = {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1}


@st.cache_resource
def init_connection():
    """Return the database handle from ``init_db()``, cached by Streamlit."""
    return init_db()


def _find_event_details():
    """Return a fresh query result over all event-detail documents to annotate."""
    return db.event_urls.find(dict(_EVENT_DETAIL_FILTER), dict(_EVENT_DETAIL_PROJECTION))


def _format_time_range(entry):
    """Render one date entry as ``YYYY-MM-DD - YYYY-MM-DD HH:MM - HH:MM``.

    Missing parts are skipped, so an entry with only a start date renders as
    just that date.
    """
    parts = [
        entry.get("start_date").strftime("%Y-%m-%d") if entry.get("start_date") else "",
        f"- {entry.get('end_date').strftime('%Y-%m-%d')}" if entry.get("end_date") else "",
        entry.get("start_time").strftime("%H:%M") if entry.get("start_time") else "",
        f"- {entry.get('end_time').strftime('%H:%M')}" if entry.get("end_time") else "",
    ]
    # filter(None, ...) drops the empty placeholders before joining.
    return " ".join(filter(None, parts))


def remove_url():
    """Button callback: delete the current document and reload the result set."""
    db.event_urls.delete_one({"_id": current_element["_id"]})
    st.session_state.elements = _find_event_details()
    # Another document now sits at this index; do not carry over the dates
    # that were entered for the deleted one.
    st.session_state.pop("time_ranges", None)


# NOTE(review): this name shadows the builtin `next`; kept because it is the
# callback name used by the "Speichern und Weiter" button.
def next():
    """Button callback: persist the entered information and advance by one."""
    db.event_urls.update_one(
        {"_id": current_element["_id"]},
        {"$set": {"information": event_information}},
    )
    st.session_state.index += 1
    st.session_state.pop("time_ranges", None)


def prev():
    """Button callback: step back by one document without saving."""
    st.session_state.index -= 1
    st.session_state.pop("time_ranges", None)


# Variables
db = init_connection()

if "index" not in st.session_state:
    st.session_state.index = 0

if "elements" not in st.session_state:
    elements = _find_event_details()
    # Preprocessing of the HTML content: store cleaned markdown for every
    # document that does not have it yet.
    for el in elements:
        if "data" not in el:
            print(el["url"])
            md = convert_html_to_md(clean_html(el["html"]))
            try:
                st.info("GPT-API Anfrage läuft")
                gpt_md = remove_boilerplate(md)
                st.info("Verarbeitung beendet")
                el["data"] = gpt_md
                db.event_urls.update_one({"_id": el["_id"]}, {"$set": {"data": el["data"]}})
            except Exception as e:
                st.error(f"Es ist ein Fehler aufgetreten: {e} \n")
                # Documents that cannot be preprocessed are removed.
                db.event_urls.delete_one({"_id": el["_id"]})
    st.session_state.elements = _find_event_details()

if "predictions_on" not in st.session_state:
    st.session_state.predictions_on = False

# An empty result set, or an index past the last document, raises IndexError;
# treat both as "nothing to show" instead of crashing the page.
try:
    current_element = st.session_state.elements[st.session_state.index]
except IndexError:
    current_element = None

predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
st.session_state.predictions_on = predictions_on

if current_element:
    # Previously stored annotation for this document (empty dict if none).
    stored_actual = current_element.get("information", {}).get("actual", {})

    if "time_ranges" not in st.session_state:
        st.session_state.time_ranges = stored_actual.get("dates", [])

    current_url = current_element['url']
    try:
        st.write(f"""### Aktuelle Seite: \n{current_url} """)

        if "data" not in current_element:
            md = convert_html_to_md(clean_html(current_element["html"]))
            try:
                gpt_md = remove_boilerplate(md)
                current_element["data"] = gpt_md
                db.event_urls.update_one(
                    {"_id": current_element["_id"]},
                    {"$set": {"data": current_element["data"]}},
                )
            except Exception as e:
                st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
                db.event_urls.delete_one({"_id": current_element["_id"]})

        # If preprocessing failed above, this lookup raises KeyError and the
        # outer handler reports it together with the URL.
        markdown_text = current_element["data"]
        normalized_text = normalize_data(markdown_text)

        predicted_title = None
        predicted_date = None
        predicted_organizers = None
        if st.session_state.predictions_on:
            predicted_title = TitleExtractor().extract_title(normalized_text)

        with st.container(border=True, height=400):
            st.markdown(normalized_text)
        with st.expander("Code ansehen"):
            with st.container(height=400):
                st.code(normalized_text)

        with st.expander("Veranstaltungsinformationen eingeben..."):
            actual_title = st.text_input(
                "Tatsächlicher Titel eingeben:",
                key="title" + str(current_element["_id"]),
                value=stored_actual.get("title", None),
            )

            # Form for adding a new date/time range.
            with st.form("time_form"):
                # First pair: start and end date.
                col1, col2 = st.columns(2)
                with col1:
                    start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
                with col2:
                    end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))

                # Second pair: start and end time.
                col3, col4 = st.columns(2)
                with col3:
                    start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
                with col4:
                    end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))

                time_submitted = st.form_submit_button("Hinzufügen")

            # When the form has been submitted, append the new range.
            if time_submitted:
                today = datetime.date.today()
                midnight = datetime.time(0)
                # Dates and times are stored as full datetimes: dates at
                # midnight, times combined with today's date.
                new_entry = {
                    "start_date": datetime.datetime.combine(start_date, midnight) if start_date else None,
                    "end_date": datetime.datetime.combine(end_date, midnight) if end_date else None,
                    "start_time": datetime.datetime.combine(today, start_time) if start_time else None,
                    "end_time": datetime.datetime.combine(today, end_time) if end_time else None,
                }
                st.session_state.time_ranges.append(new_entry)
                st.success("Zeitraum hinzugefügt!")

            input_dates = st.session_state.time_ranges
            actual_dates = "\n\n".join(_format_time_range(entry) for entry in input_dates)

            input_organizers = st.text_input(
                "Tatsächlicher Veranstalter eingeben:",
                key="organizer" + str(current_element["_id"]),
                value=",".join(stored_actual.get("organizers", [])),
            )
            # An empty input means "no organizers", not one empty organizer
            # (matches how prices are handled below).
            actual_organizers = input_organizers.split(",") if input_organizers else []

            actual_location = st.text_input(
                "Location Name",
                key="location" + str(current_element["_id"]),
                value=stored_actual.get("location", None),
            )

            with st.form("address_form"):
                st.write("Adresse eingeben")
                col1, col2 = st.columns([3, 1])  # columns for street & house number
                street = col1.text_input("Straße")
                house_number = col2.text_input("Hausnummer")

                col3, col4 = st.columns([1, 3])  # columns for postal code & city
                postal_code = col3.text_input("Postleitzahl")
                city = col4.text_input("Stadt")

                address_submitted = st.form_submit_button("Speichern")

            # The submit flag is only True for the rerun triggered by the
            # button, so the submitted address is kept in session state (per
            # document) to survive later reruns until it is saved.
            address_key = "address" + str(current_element["_id"])
            if address_submitted:
                st.session_state[address_key] = {
                    "street": street,
                    "house_number": house_number,
                    "postal_code": postal_code,
                    "city": city,
                }
            address = st.session_state.get(address_key, stored_actual.get("address", None))

            actual_prices = st.text_input(
                "Preise",
                key="price" + str(current_element["_id"]),
                value=";".join(stored_actual.get("prices", [])),
            )
            price_list = actual_prices.split(";") if actual_prices else []

        # Read by the `next` callback when "Speichern und Weiter" is clicked.
        event_information = {
            "actual": {
                "title": actual_title,
                "dates": st.session_state.time_ranges,
                "organizers": actual_organizers,
                "location": actual_location,
                "address": address,
                "prices": price_list,
            }
        }

        comparison = {
            "Information": [
                "Titel",
                "Daten",
                "Veranstalter",
                "Location",
                "Straße",
                "Hausnummer",
                "Postleitzahl",
                "Stadt",
                "Preise",
            ],
            "Tatsächlicher Wert": [
                actual_title,
                actual_dates,
                "\n\n".join(actual_organizers),
                actual_location if actual_location else "",
                address.get("street") if address else "",
                address.get("house_number") if address else "",
                address.get("postal_code") if address else "",
                address.get("city") if address else "",
                # Joined like the organizers so the cell holds a string.
                "\n\n".join(price_list),
            ],
            "Predicted Wert": [
                predicted_title,
                predicted_date,
                predicted_organizers,
                "",
                "",
                "",
                "",
                "",
                "",
            ],
        }
        df = pd.DataFrame(comparison)
        st.subheader("Vergleich der Titel:")
        st.table(df)
    except Exception as e:
        # Top-level boundary of the page: report the problem together with
        # the URL and still render the navigation buttons below.
        st.write(f"Fehler: {e}")
        st.write(current_url)

    col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
    with col1:
        st.button("Zurück", on_click=prev, disabled=st.session_state.index < 1)
    with col3:
        st.button("URL löschen", on_click=remove_url)
    with col4:
        st.button("Speichern und Weiter", on_click=next)
else:
    st.write("Es sind aktuell keine Daten in der DB zur Bearbeitung vorhanden.")
    # Reached after stepping past the last document as well; offer a way back.
    if st.session_state.index > 0:
        st.button("Zurück", on_click=prev)