|
import datetime |
|
|
|
import pandas as pd |
|
from lxml.html.defs import table_tags |
|
from lxml.html.diff import end_tag |
|
|
|
from src.utils.helpers import clean_html |
|
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md |
|
from src.nlp.playground.pipelines.title_extractor import TitleExtractor |
|
from src.utils.helpers import normalize_data |
|
from src.persistence.db import * |
|
from src.utils.apis.gpt_api import remove_boilerplate |
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
@st.cache_resource
def init_connection():
    """Return the database handle produced by ``init_db``.

    The function is wrapped in ``st.cache_resource``; presumably this makes
    Streamlit reuse one handle across reruns instead of reconnecting each
    time (the ``st`` import is not visible in this section -- confirm).
    """
    connection = init_db()
    return connection
|
|
|
def remove_url():
    """Delete the record currently on screen and refresh the working set.

    Used as the ``on_click`` callback of the delete button. Reads the
    module-level ``current_element`` and ``db`` and rewrites
    ``st.session_state.elements`` with a fresh query so the deleted record
    no longer appears.
    """
    db.event_urls.delete_one({"_id": current_element["_id"]})

    # The index is left unchanged, so a different record moves into the
    # current slot. Drop the cached date ranges so they are reloaded for
    # that record instead of leaking over from the one just deleted.
    st.session_state.pop("time_ranges", None)

    st.session_state.elements = db.event_urls.find(
        {"final": True, "class": "EventDetail"},
        {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1},
    )
|
|
|
def next():
    """Store the entered annotation for the current record and step forward.

    Used as the ``on_click`` callback of the save button. Reads the
    module-level ``current_element``, ``event_information`` and ``db``.
    """
    selector = {"_id": current_element["_id"]}
    change = {"$set": {"information": event_information}}
    db.event_urls.update_one(selector, change)

    state = st.session_state
    state.index += 1
    # Forget the cached date ranges so they are re-read for the next record.
    del state["time_ranges"]
|
|
|
def prev():
    """Step back to the previous record without saving anything.

    Used as the ``on_click`` callback of the back button.
    """
    state = st.session_state
    state.index -= 1
    # Forget the cached date ranges so they are re-read for the previous record.
    del state["time_ranges"]
|
|
|
|
|
db = init_connection()


def _find_final_event_details():
    """Query the finalized ``EventDetail`` records with the fields this page reads."""
    return db.event_urls.find(
        {"final": True, "class": "EventDetail"},
        {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1},
    )


# Position of the record currently being annotated.
st.session_state.setdefault("index", 0)

if "elements" not in st.session_state:
    # First run of the session: make sure every record has cleaned markdown
    # in its "data" field before the working set is stored.
    for record in _find_final_event_details():
        if "data" in record:
            continue
        print(record["url"])
        markdown = convert_html_to_md(clean_html(record["html"]))
        try:
            st.info("GPT-API Anfrage läuft")
            cleaned = remove_boilerplate(markdown)
            st.info("Verarbeitung beendet")
            record["data"] = cleaned
            db.event_urls.update_one(
                {"_id": record["_id"]},
                {"$set": {"data": record["data"]}},
            )
        except Exception as e:
            # Records that cannot be cleaned are removed from the collection.
            st.error(f"Es ist ein Fehler aufgetreten: {e} \n")
            db.event_urls.delete_one({"_id": record["_id"]})

    # Query again so the stored working set reflects the updates/deletions above.
    st.session_state.elements = _find_final_event_details()

# Whether model predictions are computed for the displayed record.
st.session_state.setdefault("predictions_on", False)
|
|
|
|
|
|
|
# Resolve the record to annotate. An index outside the working set (empty
# collection, stepping past the last record, deleting the last record)
# raises IndexError on lookup; map that to None so the "no data" branch
# further down is shown instead of an unhandled exception.
try:
    current_element = st.session_state.elements[st.session_state.index]
except IndexError:
    current_element = None

# Mirror the toggle into session state; the rest of the page reads the flag
# from there.
predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
st.session_state.predictions_on = predictions_on
|
|
|
def _format_time_range(entry):
    """Render one stored date/time range as a single line for the comparison table.

    ``entry`` is a dict with the optional keys ``start_date``, ``end_date``,
    ``start_time`` and ``end_time`` holding ``datetime`` values or ``None``.
    Missing parts are skipped; end values are prefixed with ``"- "``.
    """
    parts = []
    if entry.get("start_date"):
        parts.append(entry["start_date"].strftime("%Y-%m-%d"))
    if entry.get("end_date"):
        parts.append(f"- {entry['end_date'].strftime('%Y-%m-%d')}")
    if entry.get("start_time"):
        parts.append(entry["start_time"].strftime("%H:%M"))
    if entry.get("end_time"):
        parts.append(f"- {entry['end_time'].strftime('%H:%M')}")
    return " ".join(parts)


if current_element:
    # Date ranges are kept in session state so entries added through the
    # form survive reruns; seed them from the stored annotation the first
    # time this record is shown.
    if "time_ranges" not in st.session_state:
        st.session_state.time_ranges = (
            current_element.get("information", {}).get("actual", {}).get("dates", [])
        )
    current_url = current_element['url']

    try:
        st.write(f"""### Aktuelle Seite: \n{current_url} """)

        element_id = str(current_element["_id"])
        # Previously saved annotation values, used to pre-fill the inputs.
        saved_info = current_element.get("information", {}).get("actual", {})

        # Create the cleaned markdown on demand if the record has none yet.
        if "data" not in current_element:
            md = convert_html_to_md(clean_html(current_element["html"]))
            try:
                gpt_md = remove_boilerplate(md)
                current_element["data"] = gpt_md
                db.event_urls.update_one(
                    {"_id": current_element["_id"]},
                    {"$set": {"data": current_element["data"]}},
                )
            except Exception as e:
                st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
                db.event_urls.delete_one({"_id": current_element["_id"]})

        # If the cleanup above failed there is no "data" key; the resulting
        # KeyError is reported by the outer handler.
        normalized_text = normalize_data(current_element["data"])

        # Only the title has an extractor wired up here; the other
        # prediction slots stay empty.
        predicted_title = None
        predicted_date = None
        predicted_organizers = None
        if st.session_state.predictions_on:
            predicted_title = TitleExtractor().extract_title(normalized_text)

        with st.container(border=True, height=400):
            st.markdown(normalized_text)
        with st.expander("Code ansehen"):
            with st.container(height=400):
                st.code(normalized_text)

        with st.expander("Veranstaltungsinformationen eingeben..."):
            actual_title = st.text_input(
                "Tatsächlicher Titel eingeben:",
                key="title" + element_id,
                value=saved_info.get("title", None),
            )

            with st.form("time_form"):
                col1, col2 = st.columns(2)
                with col1:
                    start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
                with col2:
                    end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))

                col3, col4 = st.columns(2)
                with col3:
                    start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
                with col4:
                    end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))

                time_submitted = st.form_submit_button("Hinzufügen")

            if time_submitted:
                # Dates and times are both stored as full datetimes: dates
                # at midnight, times combined with today's date.
                new_entry = {
                    "start_date": datetime.datetime.combine(start_date, datetime.time(0)) if start_date else None,
                    "end_date": datetime.datetime.combine(end_date, datetime.time(0)) if end_date else None,
                    "start_time": datetime.datetime.combine(datetime.date.today(), start_time) if start_time else None,
                    "end_time": datetime.datetime.combine(datetime.date.today(), end_time) if end_time else None,
                }
                # Do not store a range in which nothing was filled in.
                if any(value is not None for value in new_entry.values()):
                    st.session_state.time_ranges.append(new_entry)
                    st.success("Zeitraum hinzugefügt!")
                else:
                    st.warning("Bitte mindestens ein Datum oder eine Uhrzeit angeben.")

            actual_dates = "\n\n".join(
                _format_time_range(entry) for entry in st.session_state.time_ranges
            )

            input_organizers = st.text_input(
                "Tatsächlicher Veranstalter eingeben:",
                key="organizer" + element_id,
                value=",".join(saved_info.get("organizers", [])),
            )
            # An empty input means "no organizers", not one empty name
            # (same rule as for the prices below).
            actual_organizers = input_organizers.split(",") if input_organizers else []

            actual_location = st.text_input(
                "Location Name",
                key="location" + element_id,
                value=saved_info.get("location", None),
            )

            with st.form("address_form"):
                st.write("Adresse eingeben")

                col1, col2 = st.columns([3, 1])
                # Keyed per record, like the other inputs, so text typed for
                # one URL does not carry over to the next one.
                street = col1.text_input("Straße", key="street" + element_id)
                house_number = col2.text_input("Hausnummer", key="house_number" + element_id)

                col3, col4 = st.columns([1, 3])
                postal_code = col3.text_input("Postleitzahl", key="postal_code" + element_id)
                city = col4.text_input("Stadt", key="city" + element_id)

                address_submitted = st.form_submit_button("Speichern")

            # The submit flag is only true during the rerun triggered by the
            # form button. Keep the submitted address in session state so it
            # is still part of the annotation after later widget
            # interactions (e.g. editing the price field below).
            address_key = "address" + element_id
            if address_submitted:
                st.session_state[address_key] = {
                    "street": street,
                    "house_number": house_number,
                    "postal_code": postal_code,
                    "city": city,
                }
            address = st.session_state.get(address_key, saved_info.get("address", None))

            actual_prices = st.text_input(
                "Preise",
                key="price" + element_id,
                value=";".join(saved_info.get("prices", [])),
            )

        # Read by the save callback when the user moves on.
        event_information = {
            "actual": {
                "title": actual_title,
                "dates": st.session_state.time_ranges,
                "organizers": actual_organizers,
                "location": actual_location,
                "address": address,
                "prices": actual_prices.split(";") if actual_prices else [],
            }
        }

        # Side-by-side view of entered and predicted values.
        table_data = {
            "Information": [
                "Titel",
                "Daten",
                "Veranstalter",
                "Location",
                "Straße",
                "Hausnummer",
                "Postleitzahl",
                "Stadt",
                "Preise",
            ],
            "Tatsächlicher Wert": [
                actual_title,
                actual_dates,
                "\n\n".join(actual_organizers),
                actual_location if actual_location else "",
                address.get("street") if address else "",
                address.get("house_number") if address else "",
                address.get("postal_code") if address else "",
                address.get("city") if address else "",
                actual_prices.split(";") if actual_prices else "",
            ],
            "Predicted Wert": [
                predicted_title,
                predicted_date,
                predicted_organizers,
                "",
                "",
                "",
                "",
                "",
                "",
            ],
        }
        df = pd.DataFrame(table_data)

        st.subheader("Vergleich der Titel:")
        st.table(df)

    except Exception as e:
        # Page-level boundary: show the problem and keep the navigation
        # buttons usable so the record can still be skipped or deleted.
        st.write(f"Fehler: {e}")
        st.write(current_url)

    back_col, _, delete_col, save_col = st.columns([1, 1, 1, 1])

    with back_col:
        st.button("Zurück", on_click=prev, disabled=st.session_state.index < 1)
    with delete_col:
        st.button("URL löschen", on_click=remove_url)
    with save_col:
        st.button("Speichern und Weiter", on_click=next)

else:
    st.write("Es sind aktuell keine Daten in der DB zur Bearbeitung vorhanden.")
|
|
|
|
|
|
|
|