import streamlit as st import fitz # PyMuPDF for PDF processing import pandas as pd import os from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline # Get the Hugging Face token from the environment variables hf_token = os.getenv("HF_API_TOKEN") # Load the model (Meta-Llama 3.1 8B) @st.cache_resource def load_model(): model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token) return model model = load_model() # Function to extract text from PDF def extract_pdf_text(file): doc = fitz.open(stream=file.read(), filetype="pdf") extracted_text = "" for page in doc: extracted_text += page.get_text("text") return extracted_text # Function to chunk text into smaller sections def chunk_text(text, max_tokens=1000): sentences = text.split('.') chunks = [] current_chunk = "" current_token_count = 0 for sentence in sentences: token_count = len(sentence.split()) if current_token_count + token_count > max_tokens: chunks.append(current_chunk.strip()) current_chunk = sentence current_token_count = token_count else: current_chunk += sentence + "." current_token_count += token_count if current_chunk: chunks.append(current_chunk.strip()) return chunks # Prompt generation for extracting financial data def generate_extraction_prompt(chunk): return f""" From the following text, please extract the following financial metrics in IFRS format: - Revenue - Net Income - Total Assets - Total Liabilities - Shareholders' Equity - Current Assets - Current Liabilities If the information is not found in the text, return 'Not Available'. Text: {chunk} """ # Function to query Meta-Llama for each chunk def extract_financial_metrics_from_chunk(chunk): prompt = generate_extraction_prompt(chunk) model = AutoModelForCausalLM.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token) tokenizer = AutoTokenizer.from_pretrained("meta-llama/Meta-Llama-3.1-8B-Instruct", use_auth_token=hf_token) nlp = pipeline("text-generation", model=model, tokenizer=tokenizer) response = nlp(prompt) return response[0]['generated_text'] # Process the PDF text through the model def process_pdf_text_for_metrics(text): chunks = chunk_text(text) extracted_metrics = [] for chunk in chunks: response = extract_financial_metrics_from_chunk(chunk) extracted_metrics.append(response) return extracted_metrics # Function to parse the metrics from the model response import re def parse_metrics(extracted_text): metrics = {} for line in extracted_text.split("\n"): if "Revenue" in line: metrics['Revenue'] = re.findall(r'\d+', line) # Find numeric data elif "Net Income" in line: metrics['Net Income'] = re.findall(r'\d+', line) elif "Total Assets" in line: metrics['Total Assets'] = re.findall(r'\d+', line) elif "Total Liabilities" in line: metrics['Total Liabilities'] = re.findall(r'\d+', line) elif "Shareholders' Equity" in line: metrics['Shareholders\' Equity'] = re.findall(r'\d+', line) elif "Current Assets" in line: metrics['Current Assets'] = re.findall(r'\d+', line) elif "Current Liabilities" in line: metrics['Current Liabilities'] = re.findall(r'\d+', line) return metrics # Function to aggregate metrics from all chunks def aggregate_metrics(extracted_metrics): aggregated_metrics = { "Revenue": None, "Net Income": None, "Total Assets": None, "Total Liabilities": None, "Shareholders' Equity": None, "Current Assets": None, "Current Liabilities": None } for metrics_text in extracted_metrics: parsed = parse_metrics(metrics_text) for key in parsed: if not aggregated_metrics[key]: aggregated_metrics[key] = parsed[key] return aggregated_metrics # Function to calculate financial ratios def calculate_financial_ratios(metrics): try: current_ratio = int(metrics['Current Assets'][0]) / int(metrics['Current Liabilities'][0]) debt_to_equity = int(metrics['Total Liabilities'][0]) / int(metrics['Shareholders\' Equity'][0]) roa = int(metrics['Net Income'][0]) / int(metrics['Total Assets'][0]) roe = int(metrics['Net Income'][0]) / int(metrics['Shareholders\' Equity'][0]) return { 'Current Ratio': current_ratio, 'Debt to Equity': debt_to_equity, 'Return on Assets (ROA)': roa, 'Return on Equity (ROE)': roe } except (TypeError, KeyError, IndexError): return "Some metrics were not extracted properly or are missing." # Streamlit UI st.title("Financial Ratio Extractor from IFRS Reports") st.write(""" Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue, Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio. You can also ask questions about the financial data using Meta-Llama. """) # File uploader for PDF uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"]) # If a PDF is uploaded if uploaded_file: st.write("Processing your document, please wait...") # Extract text from PDF pdf_text = extract_pdf_text(uploaded_file) # Process the text through Meta-Llama for metrics extraction extracted_metrics = process_pdf_text_for_metrics(pdf_text) # Aggregate extracted metrics aggregated_metrics = aggregate_metrics(extracted_metrics) # Calculate financial ratios financial_ratios = calculate_financial_ratios(aggregated_metrics) # Display extracted financial ratios st.subheader("Extracted Financial Ratios:") if isinstance(financial_ratios, dict): st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"])) else: st.write(financial_ratios) # Asking questions to Meta-Llama st.subheader("Ask Meta-Llama about the extracted financial data:") question = st.text_input("Enter your question here") if st.button("Ask Meta-Llama"): if question: response = model(question) st.write("Meta-Llama's Response:") st.write(response[0]['generated_text'])