import os
import re

import streamlit as st
import fitz  # PyMuPDF for PDF processing
import pandas as pd
from transformers import AutoModelForCausalLM, AutoTokenizer, pipeline


# Get the Hugging Face token from the environment variables
hf_token = os.getenv("HF_API_TOKEN")


# Load the model and tokenizer once (Meta-Llama 3.1 8B Instruct) and wrap them in a cached
# text-generation pipeline so they are not reloaded on every request
@st.cache_resource
def load_model():
    model_id = "meta-llama/Meta-Llama-3.1-8B-Instruct"
    model = AutoModelForCausalLM.from_pretrained(model_id, use_auth_token=hf_token)
    tokenizer = AutoTokenizer.from_pretrained(model_id, use_auth_token=hf_token)
    return pipeline("text-generation", model=model, tokenizer=tokenizer)

generator = load_model()
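# Note: the meta-llama/Meta-Llama-3.1-8B-Instruct repository is gated on the Hugging Face Hub,
# so HF_API_TOKEN must belong to an account that has accepted the model license. Loading an
# 8B-parameter model locally also typically needs a GPU with roughly 16 GB of memory in half precision.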

# Function to extract text from PDF
def extract_pdf_text(file):
    doc = fitz.open(stream=file.read(), filetype="pdf")
    extracted_text = ""
    for page in doc:
        extracted_text += page.get_text("text")
    return extracted_text
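# Note: page.get_text("text") returns the page's plain-text layout; tables and multi-column
# layouts in the PDF may lose their structure, which can make downstream metric extraction noisier.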

# Function to chunk text into smaller sections
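# (Token counts below are approximated by whitespace-separated word counts, which is only a
# rough proxy for the model tokenizer's actual token count.)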
def chunk_text(text, max_tokens=1000):
    sentences = text.split('.')
    chunks = []
    current_chunk = ""
    current_token_count = 0

    for sentence in sentences:
        token_count = len(sentence.split())
        if current_token_count + token_count > max_tokens:
            chunks.append(current_chunk.strip())
            current_chunk = sentence
            current_token_count = token_count
        else:
            current_chunk += sentence + "."
            current_token_count += token_count

    if current_chunk:
        chunks.append(current_chunk.strip())

    return chunks
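# Example (illustrative): chunk_text("A. B. C.", max_tokens=2) returns ["A. B.", "C."]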

# Prompt generation for extracting financial data
def generate_extraction_prompt(chunk):
    return f"""
    From the following text, please extract the following financial metrics in IFRS format:

    - Revenue
    - Net Income
    - Total Assets
    - Total Liabilities
    - Shareholders' Equity
    - Current Assets
    - Current Liabilities

    If the information is not found in the text, return 'Not Available'.

    Text: {chunk}
    """

# Function to query Meta-Llama for each chunk, reusing the cached pipeline
def extract_financial_metrics_from_chunk(chunk):
    prompt = generate_extraction_prompt(chunk)
    # return_full_text=False keeps only the newly generated text rather than echoing the prompt
    response = generator(prompt, max_new_tokens=256, return_full_text=False)
    return response[0]['generated_text']

# Process the PDF text through the model
def process_pdf_text_for_metrics(text):
    chunks = chunk_text(text)
    extracted_metrics = []

    for chunk in chunks:
        response = extract_financial_metrics_from_chunk(chunk)
        extracted_metrics.append(response)

    return extracted_metrics

# Function to parse the metrics from the model response
def parse_metrics(extracted_text):
    metric_names = [
        "Revenue", "Net Income", "Total Assets", "Total Liabilities",
        "Shareholders' Equity", "Current Assets", "Current Liabilities",
    ]
    # Match figures such as 1,234,567.89 (thousands separators and decimals included)
    number_pattern = r'\d[\d,]*(?:\.\d+)?'
    metrics = {}
    for line in extracted_text.split("\n"):
        for name in metric_names:
            if name in line:
                metrics[name] = re.findall(number_pattern, line)
                break
    return metrics
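# Example (illustrative): a response line such as "Revenue: 1,234,567" yields
# {'Revenue': ['1,234,567']}.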

# Function to aggregate metrics from all chunks
def aggregate_metrics(extracted_metrics):
    aggregated_metrics = {
        "Revenue": None,
        "Net Income": None,
        "Total Assets": None,
        "Total Liabilities": None,
        "Shareholders' Equity": None,
        "Current Assets": None,
        "Current Liabilities": None
    }

    for metrics_text in extracted_metrics:
        parsed = parse_metrics(metrics_text)
        for key, values in parsed.items():
            # Keep the first non-empty match found for each metric across chunks
            if values and not aggregated_metrics[key]:
                aggregated_metrics[key] = values

    return aggregated_metrics

# Function to calculate financial ratios
def calculate_financial_ratios(metrics):
    def to_number(values):
        # Values arrive as strings such as "1,234,567.89"; strip thousands separators
        return float(values[0].replace(",", ""))

    try:
        net_income = to_number(metrics['Net Income'])
        equity = to_number(metrics["Shareholders' Equity"])

        return {
            'Current Ratio': to_number(metrics['Current Assets']) / to_number(metrics['Current Liabilities']),
            'Debt to Equity': to_number(metrics['Total Liabilities']) / equity,
            'Return on Assets (ROA)': net_income / to_number(metrics['Total Assets']),
            'Return on Equity (ROE)': net_income / equity
        }
    except (TypeError, KeyError, IndexError, ValueError, ZeroDivisionError):
        return "Some metrics were not extracted properly or are missing."

# Streamlit UI
st.title("Financial Ratio Extractor from IFRS Reports")

st.write("""

    Upload an IFRS financial report (PDF), and this app will automatically extract key financial metrics such as Revenue, 

    Net Income, Total Assets, and calculate important financial ratios like ROA, ROE, and Debt-to-Equity Ratio.

    You can also ask questions about the financial data using Meta-Llama.

""")

# File uploader for PDF
uploaded_file = st.file_uploader("Upload your IFRS report (PDF)", type=["pdf"])

# If a PDF is uploaded
if uploaded_file:
    st.write("Processing your document, please wait...")

    # Extract text from PDF
    pdf_text = extract_pdf_text(uploaded_file)
    
    # Process the text through Meta-Llama for metrics extraction
    extracted_metrics = process_pdf_text_for_metrics(pdf_text)
    
    # Aggregate extracted metrics and keep them available for the Q&A section below
    aggregated_metrics = aggregate_metrics(extracted_metrics)
    st.session_state["aggregated_metrics"] = aggregated_metrics
    
    # Calculate financial ratios
    financial_ratios = calculate_financial_ratios(aggregated_metrics)

    # Display extracted financial ratios
    st.subheader("Extracted Financial Ratios:")
    
    if isinstance(financial_ratios, dict):
        st.table(pd.DataFrame(financial_ratios.items(), columns=["Ratio", "Value"]))
    else:
        st.write(financial_ratios)

# Asking questions to Meta-Llama
st.subheader("Ask Meta-Llama about the extracted financial data:")

question = st.text_input("Enter your question here")

if st.button("Ask Meta-Llama"):
    if question:
        # If a report has already been processed, pass the extracted metrics along as context
        context = ""
        if "aggregated_metrics" in st.session_state:
            context = f"Extracted financial metrics: {st.session_state['aggregated_metrics']}\n\n"
        response = generator(context + question, max_new_tokens=256, return_full_text=False)
        st.write("Meta-Llama's Response:")
        st.write(response[0]['generated_text'])