manaviel85370
add input fields for actual information
4eb9d09
import datetime
import pandas as pd
from lxml.html.defs import table_tags
from lxml.html.diff import end_tag
from src.utils.helpers import clean_html
from src.utils.markdown_processing.md_preprocessing import convert_html_to_md
from src.nlp.playground.pipelines.title_extractor import TitleExtractor
from src.utils.helpers import normalize_data
from src.persistence.db import *
from src.utils.apis.gpt_api import remove_boilerplate
# db event information schema:
# information:{
# actual: {
# title:
# dates: [
# {
# StartDate:
# EndDate:
# StartTime:
# EndTime:
# AdmittanceTime
# },...
# ],
# location
# address:{
# street,
# housenumber,
# postalcode,
# city
# }
# organizers
# prices
# entryFree
# categories
# description
#
# }
# predicted: same structure as actual
# }
@st.cache_resource
def init_connection():
    """Create the database handle via ``init_db``.

    Decorated with ``st.cache_resource`` so the handle is meant to be
    created once and reused instead of being rebuilt on every rerun.
    """
    connection = init_db()
    return connection
def remove_url():
    """Delete the currently shown URL document and reload the work list.

    Button callback. Removes the document behind ``current_element`` from
    ``db.event_urls``, re-runs the query for final ``EventDetail`` pages
    and stores the fresh result in ``st.session_state.elements``.
    """
    db.event_urls.delete_one({"_id": current_element["_id"]})
    st.session_state.elements = db.event_urls.find(
        {"final": True, "class": "EventDetail"},
        {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1},
    )
    # After the delete the unchanged index points at a different document.
    # Drop the cached date entries (as next/prev do) so the deleted
    # document's dates are not shown -- and later saved -- for that one.
    st.session_state.pop("time_ranges", None)
def next():
    """Save the entered information for the current element and move on.

    Button callback. Writes ``event_information`` into the ``information``
    field of the current document, advances the session index by one and
    discards the cached date entries so they are reloaded for the next
    element.

    NOTE(review): the name shadows the builtin ``next``; it is kept because
    the button wiring refers to it.
    """
    selector = {"_id": current_element["_id"]}
    change = {"$set": {"information": event_information}}
    db.event_urls.update_one(selector, change)
    st.session_state.index += 1
    st.session_state.pop("time_ranges")
def prev():
    """Step back one element without saving.

    Button callback. Decrements the session index and discards the cached
    date entries so they are reloaded for the element shown next.
    """
    state = st.session_state
    state.index -= 1
    state.pop("time_ranges")
# Variables
db = init_connection()

# Query and projection for the pages that are to be annotated.
_event_filter = {"final": True, "class": "EventDetail"}
_event_fields = {"_id": 1, "url": 1, "data": 1, "html": 1, "information": 1}

if "index" not in st.session_state:
    st.session_state.index = 0

if "elements" not in st.session_state:
    elements = db.event_urls.find(_event_filter, _event_fields)
    # Preprocessing of the HTML content: documents that have no cleaned
    # markdown yet ("data") are converted and sent through the GPT
    # boilerplate removal; the result is written back to the database.
    for el in elements:
        if "data" not in el:
            print(el["url"])
            md = convert_html_to_md(clean_html(el["html"]))
            try:
                st.info("GPT-API Anfrage läuft")
                gpt_md = remove_boilerplate(md)
                st.info("Verarbeitung beendet")
                el["data"] = gpt_md
                db.event_urls.update_one({"_id": el["_id"]}, {"$set": {"data": el["data"]}})
            except Exception as e:
                # Documents that cannot be preprocessed are removed.
                st.error(f"Es ist ein Fehler aufgetreten: {e} \n")
                db.event_urls.delete_one({"_id": el["_id"]})
    # Query again so that deleted documents are gone and the new "data"
    # values are included.
    st.session_state.elements = db.event_urls.find(_event_filter, _event_fields)

if "predictions_on" not in st.session_state:
    st.session_state.predictions_on = False

# Indexing raises IndexError when the result set is empty or the index has
# moved past the last document. Fall back to None so the page can show its
# "no data" message instead of crashing.
try:
    current_element = st.session_state.elements[st.session_state.index]
except IndexError:
    current_element = None

predictions_on = st.toggle("Predictions an (Zeigt Extrahierte Daten an, die Seite lädt dadurch langsamer).")
if predictions_on != st.session_state.predictions_on:
    st.session_state.predictions_on = predictions_on
if current_element:
    # Ground truth saved for this element earlier (empty dict if none yet).
    stored_actual = current_element.get("information", {}).get("actual", {})
    if "time_ranges" not in st.session_state:
        st.session_state.time_ranges = stored_actual.get("dates", [])
    current_url = current_element['url']
    try:
        st.write(f"""### Aktuelle Seite: \n{current_url} """)
        element_key = str(current_element["_id"])

        # Make sure cleaned markdown exists for this element.
        if "data" not in current_element:
            md = convert_html_to_md(clean_html(current_element["html"]))
            try:
                gpt_md = remove_boilerplate(md)
                current_element["data"] = gpt_md
                db.event_urls.update_one({"_id": current_element["_id"]}, {"$set": {"data": current_element["data"]}})
            except Exception as e:
                st.error(f"Es ist ein Fehler aufgetreten: {e} \nDer Datenbankeintrag wird gelöscht.")
                db.event_urls.delete_one({"_id": current_element["_id"]})
        # If preprocessing failed above, this lookup raises KeyError and the
        # outer handler reports it.
        page_markdown = current_element["data"]
        normalized_text = normalize_data(page_markdown)

        predicted_title = None
        predicted_date = None
        predicted_organizers = None
        if st.session_state.predictions_on:
            predicted_title = TitleExtractor().extract_title(normalized_text)
            # Date prediction is currently disabled:
            # predicted_date = extract_entities(normalized_text, ["date", "date_range"])
            # predicted_date = [ {d["text"],d["label"]} for d in predicted_date ] if predicted_date else None

        # Rendered page content and its raw markdown source.
        with st.container(border=True, height=400):
            st.markdown(normalized_text)
        with st.expander("Code ansehen"):
            with st.container(height=400):
                st.code(normalized_text)

        with st.expander("Veranstaltungsinformationen eingeben..."):
            actual_title = st.text_input(
                "Tatsächlicher Titel eingeben:",
                key="title" + element_key,
                value=stored_actual.get("title", None),
            )

            # Form for adding a new date/time range
            with st.form("time_form"):
                col1, col2 = st.columns(2)  # first pair: start and end date
                with col1:
                    start_date = st.date_input("Startdatum", value=None, key="start_date" + str(st.session_state.index))
                with col2:
                    end_date = st.date_input("Enddatum", value=None, key="end_date" + str(st.session_state.index))
                col3, col4 = st.columns(2)  # second pair: start and end time
                with col3:
                    start_time = st.time_input("Startzeit", value=None, key="start_time" + str(st.session_state.index))
                with col4:
                    end_time = st.time_input("Endzeit", value=None, key="end_time" + str(st.session_state.index))
                time_submitted = st.form_submit_button("Hinzufügen")

            # When the form is submitted: dates and times are widened to
            # full datetime objects (times are attached to today's date).
            if time_submitted:
                new_entry = {
                    "start_date": datetime.datetime.combine(start_date, datetime.time(0)) if start_date else None,
                    "end_date": datetime.datetime.combine(end_date, datetime.time(0)) if end_date else None,
                    "start_time": datetime.datetime.combine(datetime.date.today(), start_time) if start_time else None,
                    "end_time": datetime.datetime.combine(datetime.date.today(), end_time) if end_time else None,
                }
                st.session_state.time_ranges.append(new_entry)
                st.success("Zeitraum hinzugefügt!")

            # One display line per range; missing parts are left out.
            formatted_ranges = []
            for entry in st.session_state.time_ranges:
                parts = [
                    entry.get('start_date').strftime("%Y-%m-%d") if entry.get('start_date') else '',
                    f"- {entry.get('end_date').strftime('%Y-%m-%d')}" if entry.get('end_date') else '',
                    entry.get('start_time').strftime("%H:%M") if entry.get('start_time') else '',
                    f"- {entry.get('end_time').strftime('%H:%M')}" if entry.get('end_time') else '',
                ]
                formatted_ranges.append(" ".join(filter(None, parts)))
            actual_dates = "\n\n".join(formatted_ranges)

            input_organizers = st.text_input(
                "Tatsächlicher Veranstalter eingeben:",
                key="organizer" + element_key,
                value=",".join(stored_actual.get("organizers", [])),
            )
            # An empty input must give an empty list, not [""]; blank parts
            # and surrounding whitespace are dropped.
            actual_organizers = [
                name.strip() for name in (input_organizers or "").split(",") if name.strip()
            ]

            actual_location = st.text_input(
                "Location Name",
                key="location" + element_key,
                value=stored_actual.get("location", None),
            )

            address = stored_actual.get("address", None)
            stored_address = address or {}
            with st.form("address_form"):
                st.write("Adresse eingeben")
                # Keys are tied to the element so typed text does not carry
                # over to the next element; defaults show the saved address.
                col1, col2 = st.columns([3, 1])  # columns for street & house number
                street = col1.text_input(
                    "Straße", key="street" + element_key, value=stored_address.get("street", ""))
                house_number = col2.text_input(
                    "Hausnummer", key="house_number" + element_key, value=stored_address.get("house_number", ""))
                col3, col4 = st.columns([1, 3])  # columns for postal code & city
                postal_code = col3.text_input(
                    "Postleitzahl", key="postal_code" + element_key, value=stored_address.get("postal_code", ""))
                city = col4.text_input(
                    "Stadt", key="city" + element_key, value=stored_address.get("city", ""))
                address_submitted = st.form_submit_button("Speichern")
            if address_submitted:
                address = {
                    "street": street,
                    "house_number": house_number,
                    "postal_code": postal_code,
                    "city": city,
                }

            actual_prices = st.text_input(
                "Preise",
                key="price" + element_key,
                value=";".join(stored_actual.get("prices", [])),
            )
            price_list = [
                price.strip() for price in (actual_prices or "").split(";") if price.strip()
            ]

        # Read by the "Speichern und Weiter" callback when it writes to the DB.
        event_information = {
            "actual": {
                "title": actual_title,
                "dates": st.session_state.time_ranges,
                "organizers": actual_organizers,
                "location": actual_location,
                "address": address,
                "prices": price_list,
            }
        }

        comparison = {
            "Information": [
                "Titel",
                "Daten",
                "Veranstalter",
                "Location",
                "Straße",
                "Hausnummer",
                "Postleitzahl",
                "Stadt",
                "Preise",
            ],
            "Tatsächlicher Wert": [
                actual_title,
                actual_dates,
                "\n\n".join(actual_organizers),
                actual_location if actual_location else "",
                address.get("street") if address else "",
                address.get("house_number") if address else "",
                address.get("postal_code") if address else "",
                address.get("city") if address else "",
                # joined like the organizers so the column holds only text
                "\n\n".join(price_list),
            ],
            "Predicted Wert": [
                predicted_title,
                predicted_date,
                predicted_organizers,
                "",
                "",
                "",
                "",
                "",
                "",
            ],
        }
        df = pd.DataFrame(comparison)
        st.subheader("Vergleich der Titel:")
        st.table(df)
    except Exception as e:
        st.write(f"Fehler: {e}")
        st.write(current_url)

    # Navigation; the second column is intentionally left empty.
    col1, col2, col3, col4 = st.columns([1, 1, 1, 1])
    with col1:
        st.button("Zurück", on_click=prev, disabled=st.session_state.index < 1)
    with col3:
        st.button("URL löschen", on_click=remove_url)
    with col4:
        st.button("Speichern und Weiter", on_click=next)
else:
    st.write("Es sind aktuell keine Daten in der DB zur Bearbeitung vorhanden.")