import streamlit as st
import fitz  # PyMuPDF for PDF processing
import pandas as pd
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Get the Hugging Face token from the environment variables
hf_token = os.getenv("HF_API_TOKEN")
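# Note: this assumes the token is supplied via the environment, for example
# (hypothetical local shell):
#   export HF_API_TOKEN="hf_..."
# On Hugging Face Spaces the same value can be stored as a repository secret
# named HF_API_TOKEN, which the platform exposes as an environment variable.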
# Load the Meta-Llama 3.1 8B Instruct model and tokenizer once and cache the
# resulting text-generation pipeline so it is not reloaded on every Streamlit rerun.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@st.cache_resource
def load_generator():
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=hf_token)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

generator = load_generator()
# Function to extract text from an uploaded PDF
def extract_pdf_text(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")
    extracted_text = ""
    for page in doc:
        extracted_text += page.get_text("text")
    return extracted_text
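# Illustrative usage (hypothetical local test, outside Streamlit): any object with a
# read() method returning PDF bytes works, e.g.
#   with open("report.pdf", "rb") as f:   # "report.pdf" is a made-up filename
#       text = extract_pdf_text(f)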
# Function to chunk text into smaller sections.
# Token counts are approximated by whitespace word counts.
def chunk_text(text, max_tokens=1000):
    sentences = text.split('.')
    chunks = []
    current_chunk = ""
    current_token_count = 0
    for sentence in sentences:
        token_count = len(sentence.split())
        if current_token_count + token_count > max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + "."
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
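# The word-count heuristic above only approximates model tokens. A sketch of an exact
# alternative (not used here) would measure each sentence with the cached pipeline's
# tokenizer, e.g.:
#   token_count = len(generator.tokenizer(sentence)["input_ids"])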
# Prompt generation for extracting financial data
def generate_extraction_prompt(chunk):
    return f"""
From the following text, please extract the following financial metrics in IFRS format:
- Revenue
- Net Income
- Total Assets
- Total Liabilities
- Shareholders' Equity
- Current Assets
- Current Liabilities
If the information is not found in the text, return 'Not Available'.
Text: {chunk}
"""
# Function to query Meta-Llama for each chunk
def extract_financial_metrics_from_chunk(chunk):
    prompt = generate_extraction_prompt(chunk)
    # Reuse the cached pipeline instead of reloading the model and tokenizer per chunk;
    # return_full_text=False keeps only the model's answer, not the prompt itself.
    response = generator(prompt, max_new_tokens=256, return_full_text=False)
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    chunks = chunk_text(text)
    extracted_metrics = []
    for chunk in chunks:
        response = extract_financial_metrics_from_chunk(chunk)
        extracted_metrics.append(response)
    return extracted_metrics
# Function to parse the metrics from the model response
def parse_metrics(extracted_text):
    metrics = {}
    # Capture figures that may contain thousands separators and decimals, e.g. 1,234,567.89
    number_pattern = r'\d[\d,]*(?:\.\d+)?'
    for line in extracted_text.split("\n"):
        if "Revenue" in line:
            metrics['Revenue'] = re.findall(number_pattern, line)
        elif "Net Income" in line:
            metrics['Net Income'] = re.findall(number_pattern, line)
        elif "Total Assets" in line:
            metrics['Total Assets'] = re.findall(number_pattern, line)
        elif "Total Liabilities" in line:
            metrics['Total Liabilities'] = re.findall(number_pattern, line)
        elif "Shareholders' Equity" in line:
            metrics["Shareholders' Equity"] = re.findall(number_pattern, line)
        elif "Current Assets" in line:
            metrics['Current Assets'] = re.findall(number_pattern, line)
        elif "Current Liabilities" in line:
            metrics['Current Liabilities'] = re.findall(number_pattern, line)
    return metrics
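# Illustrative behaviour (made-up model output): parsing the text
#   "Revenue: 1,200,000\nNet Income: Not Available"
# yields {'Revenue': ['1,200,000'], 'Net Income': []} with the pattern above.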
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    aggregated_metrics = {
        "Revenue": None,
        "Net Income": None,
        "Total Assets": None,
        "Total Liabilities": None,
        "Shareholders' Equity": None,
        "Current Assets": None,
        "Current Liabilities": None
    }
    for metrics_text in extracted_metrics:
        parsed = parse_metrics(metrics_text)
        for key in parsed:
            if not aggregated_metrics[key]:
                aggregated_metrics[key] = parsed[key]
    return aggregated_metrics
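# Illustrative behaviour (made-up values): if one chunk yields Revenue ['1,000'] and a
# later chunk yields Revenue ['2,000'], the first non-empty value ['1,000'] is kept,
# because the check above only fills keys that are still None or empty.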
# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    def to_number(values):
        # Use the first captured figure and strip thousands separators
        return float(values[0].replace(',', ''))
    try:
        current_ratio = to_number(metrics['Current Assets']) / to_number(metrics['Current Liabilities'])
        debt_to_equity = to_number(metrics['Total Liabilities']) / to_number(metrics["Shareholders' Equity"])
        roa = to_number(metrics['Net Income']) / to_number(metrics['Total Assets'])
        roe = to_number(metrics['Net Income']) / to_number(metrics["Shareholders' Equity"])
        return {
            'Current Ratio': current_ratio,
            'Debt to Equity': debt_to_equity,
            'Return on Assets (ROA)': roa,
            'Return on Equity (ROE)': roe
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        return "Some metrics were not extracted properly or are missing."
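# Worked example (hypothetical figures): with Current Assets ['2,000'] and Current
# Liabilities ['1,000'], the Current Ratio is 2000 / 1000 = 2.0; with Net Income ['500']
# and Total Assets ['5,000'], ROA is 500 / 5000 = 0.1.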
# Streamlit UI
st.title("Financial Ratio Extractor from IFRS Reports")
st.write("""
Upload an IFRS financial report (PDF). The app will automatically extract key financial metrics such as Revenue,
Net Income, and Total Assets, and will calculate important financial ratios like ROA, ROE, and the Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")
# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])
# If a PDF is uploaded
if uploaded_file:
    st.write("Processing your document, please wait...")

    # Extract text from PDF
    pdf_text = extract_pdf_text(uploaded_file)

    # Process the text through Meta-Llama for metrics extraction
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)

    # Aggregate extracted metrics
    aggregated_metrics = aggregate_metrics(extracted_metrics)

    # Calculate financial ratios
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    # Display extracted financial ratios
    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        st.write(financial_ratios)
    # Asking questions to Meta-Llama about the uploaded report
    st.subheader("Ask Meta-Llama about the extracted financial data:")
    question = st.text_input("Enter your question here")
    if st.button("Ask Meta-Llama"):
        if question:
            # Query the cached text-generation pipeline rather than the bare model object
            response = generator(question, max_new_tokens=256, return_full_text=False)
            st.write("Meta-Llama's Response:")
            st.write(response[0]['generated_text'])