import streamlit as st
import fitz  # PyMuPDF for PDF processing
import pandas as pd
import os
import re
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline
# Get the Hugging Face token from the environment variables
hf_token = os.getenv("HF_API_TOKEN")
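# Note: this assumes the token is supplied via the environment, for example
# (hypothetical local shell):
#   export HF_API_TOKEN="hf_..."
# On Hugging Face Spaces the same value can be stored as a repository secret
# named HF_API_TOKEN, which the platform exposes as an environment variable.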
# Load the Meta-Llama 3.1 8B Instruct model and tokenizer once and cache the
# resulting text-generation pipeline so it is not reloaded on every Streamlit rerun.
MODEL_ID = "meta-llama/Meta-Llama-3.1-8B-Instruct"

@st.cache_resource
def load_generator():
    model = AutoModelForCausalLM.from_pretrained(MODEL_ID, token=hf_token)
    tokenizer = AutoTokenizer.from_pretrained(MODEL_ID, token=hf_token)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

generator = load_generator()
# Function to extract text from an uploaded PDF
def extract_pdf_text(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")
    extracted_text = ""
    for page in doc:
        extracted_text += page.get_text("text")
    return extracted_text
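# Illustrative usage (hypothetical local test, outside Streamlit): any object with a
# read() method returning PDF bytes works, e.g.
#   with open("report.pdf", "rb") as f:   # "report.pdf" is a made-up filename
#       text = extract_pdf_text(f)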
# Function to chunk text into smaller sections.
# Token counts are approximated by whitespace word counts.
def chunk_text(text, max_tokens=1000):
    sentences = text.split('.')
    chunks = []
    current_chunk = ""
    current_token_count = 0
    for sentence in sentences:
        token_count = len(sentence.split())
        if current_token_count + token_count > max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk = sentence + "."
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count
    if current_chunk:
        chunks.append(current_chunk.strip())
    return chunks
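# The word-count heuristic above only approximates model tokens. A sketch of an exact
# alternative (not used here) would measure each sentence with the cached pipeline's
# tokenizer, e.g.:
#   token_count = len(generator.tokenizer(sentence)["input_ids"])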
# Prompt generation for extracting financial data
def generate_extraction_prompt(chunk):
    return f"""
From the following text, please extract the following financial metrics in IFRS format:
- Revenue
- Net Income
- Total Assets
- Total Liabilities
- Shareholders' Equity
- Current Assets
- Current Liabilities
If the information is not found in the text, return 'Not Available'.
Text: {chunk}
"""
# Function to query Meta-Llama for each chunk
def extract_financial_metrics_from_chunk(chunk):
    prompt = generate_extraction_prompt(chunk)
    # Reuse the cached pipeline instead of reloading the model and tokenizer per chunk;
    # return_full_text=False keeps only the model's answer, not the prompt itself.
    response = generator(prompt, max_new_tokens=256, return_full_text=False)
    return response[0]['generated_text']
# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    chunks = chunk_text(text)
    extracted_metrics = []
    for chunk in chunks:
        response = extract_financial_metrics_from_chunk(chunk)
        extracted_metrics.append(response)
    return extracted_metrics
# Function to parse the metrics from the model response
def parse_metrics(extracted_text):
    metrics = {}
    # Capture figures that may contain thousands separators and decimals, e.g. 1,234,567.89
    number_pattern = r'\d[\d,]*(?:\.\d+)?'
    for line in extracted_text.split("\n"):
        if "Revenue" in line:
            metrics['Revenue'] = re.findall(number_pattern, line)
        elif "Net Income" in line:
            metrics['Net Income'] = re.findall(number_pattern, line)
        elif "Total Assets" in line:
            metrics['Total Assets'] = re.findall(number_pattern, line)
        elif "Total Liabilities" in line:
            metrics['Total Liabilities'] = re.findall(number_pattern, line)
        elif "Shareholders' Equity" in line:
            metrics["Shareholders' Equity"] = re.findall(number_pattern, line)
        elif "Current Assets" in line:
            metrics['Current Assets'] = re.findall(number_pattern, line)
        elif "Current Liabilities" in line:
            metrics['Current Liabilities'] = re.findall(number_pattern, line)
    return metrics
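# Illustrative behaviour (made-up model output): parsing the text
#   "Revenue: 1,200,000\nNet Income: Not Available"
# yields {'Revenue': ['1,200,000'], 'Net Income': []} with the pattern above.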
# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    aggregated_metrics = {
        "Revenue": None,
        "Net Income": None,
        "Total Assets": None,
        "Total Liabilities": None,
        "Shareholders' Equity": None,
        "Current Assets": None,
        "Current Liabilities": None
    }
    for metrics_text in extracted_metrics:
        parsed = parse_metrics(metrics_text)
        for key in parsed:
            if not aggregated_metrics[key]:
                aggregated_metrics[key] = parsed[key]
    return aggregated_metrics
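# Illustrative behaviour (made-up values): if one chunk yields Revenue ['1,000'] and a
# later chunk yields Revenue ['2,000'], the first non-empty value ['1,000'] is kept,
# because the check above only fills keys that are still None or empty.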
# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    def to_number(values):
        # Use the first captured figure and strip thousands separators
        return float(values[0].replace(',', ''))
    try:
        current_ratio = to_number(metrics['Current Assets']) / to_number(metrics['Current Liabilities'])
        debt_to_equity = to_number(metrics['Total Liabilities']) / to_number(metrics["Shareholders' Equity"])
        roa = to_number(metrics['Net Income']) / to_number(metrics['Total Assets'])
        roe = to_number(metrics['Net Income']) / to_number(metrics["Shareholders' Equity"])
        return {
            'Current Ratio': current_ratio,
            'Debt to Equity': debt_to_equity,
            'Return on Assets (ROA)': roa,
            'Return on Equity (ROE)': roe
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        return "Some metrics were not extracted properly or are missing."
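# Worked example (hypothetical figures): with Current Assets ['2,000'] and Current
# Liabilities ['1,000'], the Current Ratio is 2000 / 1000 = 2.0; with Net Income ['500']
# and Total Assets ['5,000'], ROA is 500 / 5000 = 0.1.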
# Streamlit UI
st.title("Financial Ratio Extractor from IFRS Reports")
st.write("""
Upload an IFRS financial report (PDF). The app will automatically extract key financial metrics such as Revenue,
Net Income, and Total Assets, and will calculate important financial ratios like ROA, ROE, and the Debt-to-Equity Ratio.
You can also ask questions about the financial data using Meta-Llama.
""")
# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])
# If a PDF is uploaded
if uploaded_file:
    st.write("Processing your document, please wait...")

    # Extract text from PDF
    pdf_text = extract_pdf_text(uploaded_file)

    # Process the text through Meta-Llama for metrics extraction
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)

    # Aggregate extracted metrics
    aggregated_metrics = aggregate_metrics(extracted_metrics)

    # Calculate financial ratios
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    # Display extracted financial ratios
    st.subheader("Extracted Financial Ratios:")
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        st.write(financial_ratios)
    # Asking questions to Meta-Llama about the uploaded report
    st.subheader("Ask Meta-Llama about the extracted financial data:")
    question = st.text_input("Enter your question here")
    if st.button("Ask Meta-Llama"):
        if question:
            # Query the cached text-generation pipeline rather than the bare model object
            response = generator(question, max_new_tokens=256, return_full_text=False)
            st.write("Meta-Llama's Response:")
            st.write(response[0]['generated_text'])