Spaces:
Running
Running
Upload 2 files
Browse files- app_stream.py +187 -0
- data_collector.py +399 -0
app_stream.py
ADDED
@@ -0,0 +1,187 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
import streamlit as st
|
2 |
+
# import wikipedia
|
3 |
+
from streamlit_lottie import st_lottie
|
4 |
+
import regex as re
|
5 |
+
from streamlit_js_eval import streamlit_js_eval
|
6 |
+
from common.utils import *
|
7 |
+
from data_collector import *
|
8 |
+
from langchain_community.llms import Ollama
|
9 |
+
|
10 |
+
from langchain_community.llms import Ollama
|
11 |
+
import pandas as pd
|
12 |
+
import json
|
13 |
+
# Wide layout plus the browser's real viewport size (fetched via JS) so panel
# heights can be computed proportionally below.
st.set_page_config(layout="wide")
screen_width = streamlit_js_eval(label="screen.width",js_expressions='screen.width')
screen_height = streamlit_js_eval(label="screen.height",js_expressions='screen.height')


# Streamlit's session_state is the app-wide store; alias it and seed every key
# this script reads so later lookups never KeyError across reruns.
condition_capture = st.session_state
if 'schema' not in condition_capture:
    condition_capture['schema'] = {}          # {column_name: [values]} generated from the prompt
if 'prompt' not in condition_capture:
    condition_capture['prompt'] = ""          # last prompt the user submitted
if "count" not in condition_capture:
    condition_capture['count'] = 0            # target number of rows to generate
if "prev_schema" not in condition_capture:
    condition_capture['prev_schema'] = {}     # schema seen on the previous run (change detection)
if "textual_value" not in condition_capture:
    condition_capture['textual_value'] = {}   # cached (plain_text, html) wikipedia results
textual_value=None


# Local alias used by both columns below.
schema=condition_capture['schema']
|
34 |
+
|
35 |
+
|
36 |
+
|
37 |
+
|
38 |
+
# Two-column layout: column1 (left) holds the schema/data UI, column2 (right)
# shows the wikipedia pages the generator is working from.
column1,column2 = st.columns(2)
with column2:



    # Fetch wikipedia context once per schema: only when a schema exists and
    # nothing is cached yet (relevent_value hits the network, so cache hard).
    if len(condition_capture['schema'])!=0 and len(condition_capture['textual_value'])==0:
        # condition_capture['prev_schema'] = condition_capture['schema']
        condition_capture['textual_value']=relevent_value(str(condition_capture['schema']).lower(),50)
    if len(condition_capture['schema'])!=0:
        # relevent_value returns (plain_text_dict, html_dict); [1] is the HTML pages.
        html_page = condition_capture['textual_value'][1]
        textual_value = condition_capture['textual_value'][0]
        st.write("<br>",unsafe_allow_html=True)

        with st.container(border=True,height=int(screen_height/2.3)):
            st.header("Wikipedia insights")
            # NOTE(review): updated_schema is also read inside column1 but only
            # assigned here, i.e. only when a schema exists — presumably safe
            # because column1 gates on the same condition; TODO confirm.
            updated_schema = st.button("Start processing")
            selector=st.empty()
            write =st.empty()
            # Slider label is a leftover template string; options are page titles.
            start_page= selector.select_slider("Select a range of color wavelength",options=[i for i in html_page],key="start_page")
            write.write(html_page[start_page],unsafe_allow_html=True)
        # )
72 |
+
with column1:

    if str(schema)!=str({}):
        tabs = st.tabs(["Schema","Data Generation"])
        with tabs[0]:
            if str(schema)!=str({}):

                schema_column1,schema_column2 = st.columns(2)
                with schema_column1:
                    # Editable listing of the schema's column names.
                    edited_df = st.data_editor([str(i) for index,i in enumerate(schema)],hide_index=True,use_container_width=True,num_rows='dynamic',height=int(screen_height/3))

                with schema_column2:
                    number = st.number_input("Number of rows",min_value=1,max_value=1000,value=10)
                    # Only commit the row count once "Start processing" was pressed
                    # (updated_schema is the button defined in column2).
                    if number!=condition_capture['count'] and updated_schema:
                        condition_capture['count'] = number

                    # NOTE(review): Windows-style path separator; breaks on POSIX hosts.
                    with open("animation\\edit_file.json") as animate:
                        url_json=json.load(animate)
                    st_lottie(url_json,height = int(screen_height/3))

        with tabs[1]:
            with open("animation\\no data animation.json") as animate:
                url_json=json.load(animate)
            dataframe=st.empty()

            # No target row count yet -> placeholder animation instead of data.
            if condition_capture['count']==0:
                st_lottie(url_json,height = int(screen_height/3))

            else:
                smart_append=[]
                # Schema changed since last run: reset the accumulated table.
                if condition_capture['prev_schema'] != condition_capture['schema']:
                    condition_capture['prev_schema'] = condition_capture['schema']
                    condition_capture['current_append']={}

                # Stream LLM-extracted rows out of the cached wikipedia text.
                for text_indexing,store in enumerate(actual_value(textual_value,schema)):
                    dummy_value =dictionary_formatting(store)
                    # Keep only the most recent value per key (lists are trimmed
                    # from the front until at most one element remains).
                    for keys in dummy_value:
                        while len(dummy_value[keys])>=2:
                            dummy_value[keys].pop(0)
                    dummy_value = dictionary_formatting(dummy_value)

                    if dummy_value != None:


                        smart_append.append(dummy_value)
                        print(dummy_value)
                        # Fold this row into the incrementally displayed table.
                        for keys in dummy_value:
                            if keys not in condition_capture['current_append']:
                                condition_capture['current_append'][str(keys)]=[]
                            condition_capture['current_append'][str(keys)].append(str([i for i in dummy_value[keys]]))
                        dataframe.dataframe(condition_capture['current_append'])


                        # Stop once the last column reaches the requested row count.
                        if len(condition_capture['current_append'][[i for i in condition_capture['current_append']][-1]])>=condition_capture['count']:
                            break

                # print(dummy_value)
                # if smart_check(dummy_value)!=True:
                # smart_value=verification(dummy_value)
                # if statement(condition_capture['schema'],smart_value):
                # st.dataframe(smart_value)
                condition_capture['current_append']={}
                if len(smart_append)==0:

                    # NOTE(review): current_append was just reset to {}, so this
                    # indexing looks like it would KeyError when smart_append is
                    # empty — verify the intended source of `ranger`.
                    ranger=len(condition_capture['current_append'][[i for i in condition_capture['current_append']][0]])
                    for indexing in range(ranger):
                        working_dict = {}
                        for j in condition_capture['current_append']:

                            working_dict[j]=condition_capture['current_append'][j][indexing][0]
                        smart_append.append(working_dict)
                # Rank rows by how many fields they filled (best first).
                smart_movement = sorting(smart_append)

                for keys in smart_movement:
                    # Keys of smart_movement are str(dict); eval turns them back
                    # into dicts. eval on model-derived text is unsafe in general.
                    value=eval(keys)
                    for keys in value:
                        if keys not in condition_capture['current_append']:
                            condition_capture['current_append'][str(keys)]=[]
                        condition_capture['current_append'][str(keys)].append([str(i) for i in value[keys]])
                    dataframe.dataframe(condition_capture['current_append'])
                for indexing,j in enumerate(smart_movement):
                    try:
                        # Convert string to dictionary
                        dummy_value = eval(j)

                        # Process dictionary values
                        for key in dummy_value:
                            while len(dummy_value[key]) >= 2:
                                dummy_value[key].pop(0)

                        # Format dictionary
                        formatted = dictionary_formatting(dummy_value)
                        print(formatted)
                        # Verify and store result
                        verification_result = verification(formatted) if formatted else None
                        for j in verification_result:
                            if j in condition_capture['current_append']:
                                condition_capture['current_append'][j][indexing]=[str(i) for i in verification_result[j]]
                        dataframe.dataframe(condition_capture['current_append'])

                    # Blanket except keeps the UI loop alive on any bad row.
                    except:
                        pass



    # Prompt box: a changed prompt regenerates the schema and restarts the app.
    prompt = st.text_input(label="Please use prompt to generate data",value=condition_capture['prompt'])
    if prompt != str(condition_capture['prompt']):

        condition_capture['prompt'] = prompt
        schema = schema_generator(prompt)
        condition_capture['schema'] = schema
        condition_capture['current_append']={}

        st.rerun()
|
data_collector.py
ADDED
@@ -0,0 +1,399 @@
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
1 |
+
|
2 |
+
import wikipedia
|
3 |
+
import wikipediaapi
|
4 |
+
import regex as re
|
5 |
+
from sentence_transformers import SentenceTransformer,util
|
6 |
+
from transformers import pipeline
|
7 |
+
import requests
|
8 |
+
|
9 |
+
# def consume_llm_api(prompt):
|
10 |
+
# """
|
11 |
+
# Sends a prompt to the LLM API and processes the streamed response.
|
12 |
+
# """
|
13 |
+
# url = "https://3c93-70-167-32-130.ngrok-free.app/api/llm-response"
|
14 |
+
# headers = {"Content-Type": "application/json"}
|
15 |
+
# payload = {"prompt": prompt,"extension":"1"}
|
16 |
+
|
17 |
+
|
18 |
+
# print("Sending prompt to the LLM API...")
|
19 |
+
# response_ = requests.post(url, json=payload,verify=False)
|
20 |
+
# response_data = response_.json()
|
21 |
+
# return response_data['text']
|
22 |
+
def consume_llm_api(prompt):
    """Send *prompt* to a locally served llama3 model via Ollama and return the raw reply text."""
    # A fresh client per call; temperature 0.3 keeps output mostly deterministic.
    llm = Ollama(model="llama3:latest", temperature=0.3)
    return llm.invoke(prompt)
|
25 |
+
|
26 |
+
|
27 |
+
def relevent_value(long_query,count=3):
    """Search wikipedia for *long_query* and fetch the top *count* hits.

    Returns a pair of dicts keyed by page title:
    (plain_text_pages, html_pages).
    """
    titles = wikipedia.search(long_query,results=count)

    # Two API handles: one per extract format (plain WIKI text vs HTML).
    plain_api = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.WIKI)
    html_api = wikipediaapi.Wikipedia(user_agent='MyProjectName ([email protected])', language='en',extract_format=wikipediaapi.ExtractFormat.HTML)
    plain_pages={}
    html_pages={}
    for title in titles:
        html_pages[title] = html_api.page(title).text
        plain_pages[title] = plain_api.page(title).text
    return plain_pages,html_pages
|
41 |
+
|
42 |
+
|
43 |
+
# Shared model handles created once at import time.
# NOTE(review): this import sits mid-file; consume_llm_api above also uses
# Ollama — it works because the import runs before any call, but the import
# belongs at the top of the file.
from langchain_community.llms import Ollama
model=Ollama(model="llama3:latest",temperature=0.3)
# Sentence embedder used for intent matching in agent_work_result.
agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# Extractive QA model used to pull search terms / key names out of commands.
qa_model = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2')
|
47 |
+
|
48 |
+
# textual_value
|
49 |
+
|
50 |
+
def construction_edit(textual_value,schema):
    """Ask the LLM to reshape *textual_value* into a dict that follows *schema*.

    Returns the raw LLM reply (expected, but not guaranteed, to be a dict literal).
    """
    # Assemble the instruction block in one pass instead of repeated +=.
    parts = (
        textual_value + "\n",
        "Above is the generated text from wikipedia and below is the rule that has to be filled in the data. \n",
        "The data should be in the form of a dictionary and it must follow the following schema: \n",
        str(schema) + "\n",
        "The length of each list of each key must be same in the generated data(mandatory)." + "\n",
        "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ." + "\n",
        "The output must be a dictionary" + "\n",
    )
    return consume_llm_api("".join(parts))
|
60 |
+
|
61 |
+
def dictionary_check(construction_edit):
    """Return True when every key of the dict maps to a non-empty value, else False."""
    return all(len(construction_edit[field]) != 0 for field in construction_edit)
|
66 |
+
|
67 |
+
def actual_value(textual_value,schema):
    """Generator: for each wikipedia article, yield a schema-shaped dict of extracted values.

    *textual_value* maps page title -> plain text; *schema* maps column -> list.
    Articles whose LLM output cannot be parsed are silently skipped.
    """
    for j in textual_value:
        # Build the extraction prompt: article text first, then the rules.
        formatted_result = str(textual_value[j])+ "\n"
        formatted_result += "Please fill the following schema with the relevant data from the text above."+ "\n"
        formatted_result += "Here is the schema"+"\n"
        formatted_result += str(schema)
        formatted_result += "Please generate data according to schema and fill this template with your answers.\n"
        formatted_result += "You have to fill each key with the relevant data from the text above."+ "\n"
        formatted_result += "Please return the exact key value pair as the schema above. "+ "\n"
        formatted_result += "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ."+ "\n"
        formatted_result += "Only fill the keys that are in the schema."+ "\n"
        formatted_result += "If you are not sure about the data, you can add 'Na'."+ "\n"
        formatted_result += "It's an order you can not add any other text(e.g Here is the filled-in JSON schema) or note ."+ "\n"
        formatted_result += "The length of each list of each key must be same in the generated data(mandatory)."+"\n"
        raw_output = consume_llm_api(formatted_result)
        try:
            # Second LLM pass normalises the reply toward a dict literal.
            data=construction_edit(raw_output,schema)
            # Recursive pattern (?R) needs the third-party `regex` module (not stdlib `re`):
            # it matches the first balanced {...} block in the reply.
            json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', data)
            # SECURITY: eval on LLM-produced text executes arbitrary expressions.
            access_value=eval(json_object_match.group())
            # Ensure every schema column exists (empty list when missing).
            for schema_key in schema:
                if schema_key not in access_value:
                    access_value[schema_key]=list(set())
            # De-duplicate each column and drop the "Na" placeholder.
            for schema_key in access_value:
                access_value[schema_key]=list(set(access_value[schema_key]))
                access_value[schema_key]=list(set(access_value[schema_key])-set(["Na"]))
            yield access_value

        # Bare except: any parse/eval failure just skips this article.
        except:
            access_value=None
|
96 |
+
|
97 |
+
|
98 |
+
|
99 |
+
|
100 |
+
def context_data_relevancy(value,context):
    """Ask the LLM whether *context* can plausibly fill gaps in *value*.

    The reply is expected to contain the sentinel "@yahoo@" (yes) or "@NO@".
    """
    # Single-pass prompt assembly; each fragment reproduced verbatim.
    pieces = (
        "You are a professional reasearcher from data ." + "\n",
        "You have to check can we fill some of the missing values in the " + str(value) + ". \n",
        "The possible part which available in the context has to be relevent with already present data" + ". \n",
        "from the context given below" + ". \n",
        context + "\n",
        "Be strict while thing of filling data" + ". \n",
        "Just return @yahoo@ if 90% possible else @NO@" + ". \n",
    )
    return consume_llm_api("".join(pieces))
|
112 |
+
|
113 |
+
def agent_work_result(query,value):
    """Execute one agent command against the data dict *value*.

    The command *query* is classified by embedding similarity into one of
    three intents — extract data from wikipedia, append a value to a key,
    or check completeness — and the matching tool runs.  Returns a status
    string (or fetched article text) that agent_data_prep feeds back to
    the commanding LLM.
    """
    # NOTE: shadows the module-level embedder; kept for behavioural parity.
    agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
    query_embedding = agent_understanding.encode(query)
    # Similarity of the command against the three supported intents.
    score1 = util.cos_sim(query_embedding,agent_understanding.encode("extract data for"))
    score2 = util.cos_sim(query_embedding,agent_understanding.encode("append data in "))
    score3 = util.cos_sim(query_embedding,agent_understanding.encode("check data"))

    if score1 > score2 and score1 > score3:
        # Intent 1: pull the search term out of the command, fetch wikipedia
        # pages, and return the first one the relevancy check approves.
        question = "search word ?"
        result = qa_model(question=question, context=query)
        result = result['answer']
        print("Extracting query:", result)
        wikisearch = relevent_value(result,3)
        html_pages = wikisearch[1]
        wikisearch = wikisearch[0]

        for searches in wikisearch:
            if "@yahoo@" in context_data_relevancy(value,wikisearch[searches]):
                return wikisearch[searches]
        return "No data found"
    elif score2 > score1 and score2 > score3:
        # Intent 2: append a value to a (currently empty) key.
        try:
            print("Appending command:", query)
            question1 = "which value we are adding to key ?"
            result1 = qa_model(question=question1, context=query)
            question2 = "In which key we are appending ?"
            result2 = qa_model(question=question2, context=query)
            result1 = result1['answer']
            result2 = result2['answer']

            if len(value[result2])==0:
                value[result2].append(result1)
                return "Now you can fill the remaining columns"
            else:
                return "You are putting value in the same key column again not accepted."
        except Exception as e:
            # Surface the failure (e.g. unknown key) back to the agent as text.
            return str(e)
    else:
        # Intent 3: completeness check — all columns must have equal lengths.
        # BUG FIX: min_ was initialised to 0, so `len(...) < min_` never fired
        # (lengths are never negative) and min_ stayed 0; the success branch was
        # unreachable unless every column was empty.  Compute the real extremes.
        lengths = [len(value[keys]) for keys in value]
        min_ = min(lengths, default=0)
        max_ = max(lengths, default=0)
        if min_==max_:
            # NOTE: this exact (misspelled) string is the sentinel that
            # agent_data_prep's while-loop compares against — do not "fix"
            # the typo here without changing it there in the same commit.
            return "You dia a great job"
        else:
            return "Please append the data correctly so that the length of each key is same and data is also relevant"
|
164 |
+
|
165 |
+
|
166 |
+
def full_alignment(value):
    """Return True when every key of *value* holds at least one entry."""
    return all(len(value[field]) != 0 for field in value)
|
171 |
+
|
172 |
+
def query_formatting(result):
    """Split an LLM reply into lines, discarding the first line (usually a header)."""
    # str.split("\n") always yields at least one element, so dropping index 0
    # is exactly what the original pop(0)-after-length-check did.
    return result.split("\n")[1:]
|
177 |
+
def missing_value_completion(store,value):
    """Try to fill empty keys of *value* (mutated in place) from fetched article texts.

    *store* maps page title -> article text.  Each article is sent to the LLM
    together with the partially-filled dict; any parsed result tops up empty
    keys.  Returns *value* as soon as every key is filled, else None implicitly.
    """
    filler_prompt = "Below is mentioned ajson data\n"
    filler_prompt += str(value)+"\n"
    filler_prompt += "you only need to find missing data from the mentioned context section."
    filler_prompt += "You will return the results in below mentioned format.\n"
    filler_prompt += "The output will be in json format."
    filler_prompt += "context:\n"

    for search_key in store:
        try:
            fill_text = store[search_key]
            response = consume_llm_api(filler_prompt+fill_text)

            # Recursive regex (needs the `regex` module) grabs the first balanced {...}.
            json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', response)
            # SECURITY: eval on LLM output executes arbitrary expressions.
            access_value=eval(json_object_match.group())
            # Top up only the still-empty keys, taking one value each.
            for keys in value:
                if len(value[keys])==0 and keys in access_value:
                    value[keys].append(access_value[keys].pop(0))
            print(value)
            if full_alignment(value):
                return value
        # Bare except: a bad article/reply just moves on to the next page.
        except:
            pass
|
201 |
+
|
202 |
+
|
203 |
+
|
204 |
+
|
205 |
+
def verification(value):
    """Fill missing entries of *value* (mutated in place) via LLM-suggested web searches.

    Asks the LLM for a list of search queries, fetches wikipedia pages for each,
    and delegates the actual filling to missing_value_completion.  Returns the
    completed dict as soon as every key is filled; otherwise falls through and
    returns the raw query list (NOTE(review): callers appear to expect a dict —
    confirm this mixed return type is intentional).
    """

    validation_prompt = "Can you prepare a list of text(many as possible) that can be searched on google for filling(relevent data) the missing data below.\n"
    validation_prompt += str(value)+"\n"
    validation_prompt += "You need to prepare it by the following manner"
    validation_prompt += "1. Mention it line by line.\n"
    validation_prompt += "2. Please seperate it line by line.\n"
    validation_prompt += "3. Headers are not required\n"
    validation_prompt += "4. Please do not add any helper text example: Here is the required search queries , Here are the search queries .\n"
    validation_prompt += "5. Please do not add any notes"
    print("Searching for missing values")
    # query_formatting drops the first reply line (often a header).
    result=query_formatting(consume_llm_api(validation_prompt))

    for search_queries in result:
        if len(search_queries)!=0:
            print(search_queries)
            # relevent_value returns (plain_texts, html_texts).
            store=relevent_value(search_queries)
            html_pages = store[1]
            store = store[0]
            missing_value_completion(store,value)
            if full_alignment(value):
                return value




    return result
|
234 |
+
|
235 |
+
def agent_data_prep(value,query):
    """Drive an LLM "agent" loop until every key of *value* is filled.

    Each iteration builds a long instruction prompt (mission, running income
    as a reward signal, prior tool result, data so far, and the command
    grammar), asks llama3 for one command, and executes it through
    agent_work_result.  Loop exits when the tool reports the sentinel
    success string or full_alignment sees no empty keys.  Mutates and
    returns *value*.
    """
    end_result = ""
    # Reward bookkeeping fed back into the prompt (+$1000 per good append).
    angent_earlier_income ="0"
    pre_money_saving = "0"
    mission = "First to fill most importent column \n"
    # Sentinel must match agent_work_result's return string exactly (typo included).
    while end_result!="You dia a great job":

        if full_alignment(value):
            return value


        agent_instruction = mission
        agent_instruction += "your previous income"+pre_money_saving+"\n"
        agent_instruction += "your current income"+angent_earlier_income+"\n"
        pre_money_saving = angent_earlier_income
        # Penalise repeated appends to the same column: scold + dock $1000.
        if end_result=="You are putting value in the same key column again not accepted.":

            mission = "Why you are always filling the"+[i for i in value][-1]+"only.\n"
            mission += "We are removing $1000 from you account \n"
            angent_earlier_income = str(int(angent_earlier_income)-1000)
        agent_instruction += end_result + "\n" +"Above is the result of your previous command. Please give the next command to the agent."
        agent_instruction += query + "\n"
        agent_instruction += "Below is the data gathered upto now" + "\n"
        agent_instruction += str(value) + "\n"
        agent_instruction += "Please utilize the tool where you can command the agent to do any of the following tasks(one instruction at a time )"+ "\n"
        agent_instruction += "You only have to fill one value for each key if its not present. \n"
        agent_instruction += "From now onwards your each statement is understand as command which is categoried in any of the commands in mentioned below examples. \n"
        agent_instruction += "1. Ask agent to extract data from the web about anything like search for lamp production ,smartphone parts etc .\n"
        agent_instruction += "2. Give any specific value to append in current generated data . Please also mention the key in which the agent has to append the data .\n"
        agent_instruction += "3. Ask the agent to put the generated data on check weather each column fills correctly or not .\n"
        agent_instruction += "Here is the instruction to give commands to the agent. \n"
        agent_instruction += "You can give commands to the agent ,few examples are mentioned below. \n"

        agent_instruction += "1. Extract data about iron man suit or iron man suit mark1 \n"
        agent_instruction += "(while thinking about extract data look into the data \n"
        agent_instruction += "where data can be append and then search relevent query \n"
        agent_instruction += "like green arrow from DC only if DC and green arraow is in different column key values )\n\n"

        agent_instruction += "2. Append value 'bmw 4' to Car Model key \n"
        agent_instruction += "(While appending the value you must have read the data from extract data command and remember, if you found anything relevent don't forget to append.\n"
        agent_instruction += "The appending value has to be different not already present.) \n\n"

        agent_instruction += "Any different grammatical version of the above commands. \n"
        agent_instruction += "Command has to be given only for 'data filling' purpose. \n"

        agent_instruction += "While command like search for or extract information about something it has to be relevent query search. \n"
        agent_instruction += "The relevent the query the more accurate the data will be. \n"
        agent_instruction += "Be cautious while filling the data It has to be correct. \n"
        agent_instruction += "For each correct append you will get $1000. \n"

        agent_instruction += "Give your command only no text . \n"

        agent_instruction += "There will an audit after filling all the columns on data for its validity. \n"
        agent_instruction += "Some mistakes are okay but But if we find you guilty there are some repercussion."

        # instructionto give commands to the agent

        # A fresh judge model each turn; default temperature (unlike consume_llm_api).
        judgement = Ollama(model = "llama3:latest")
        command = judgement.invoke(agent_instruction)

        end_result = agent_work_result(command,value)
        # Reward a successful append.
        if "Now you can fill the remaining columns" in end_result:
            angent_earlier_income = str(int(angent_earlier_income)+1000)
    print("--------------------")
    print(value)
    print("--------------------")
    return value
|
302 |
+
|
303 |
+
def dictionary_formatting(value):
    """Return a new dict whose keys are the stripped keys of *value*.

    Drains *value* in place (entries are pop()ed).  When a stripped key
    already exists verbatim in *value*, that exact entry wins over the
    whitespace-padded one.
    """
    cleaned = {}
    for raw_key in list(value):
        stripped = raw_key.strip()
        if stripped in value:
            if stripped not in cleaned:
                cleaned[stripped] = []
            cleaned[stripped] = value.pop(stripped)
        else:
            cleaned[stripped] = value.pop(raw_key)
    return cleaned
|
314 |
+
|
315 |
+
|
316 |
+
def schema_formatter(output):
    """Turn a comma-separated column string into {column: []}.

    Each column gets its own fresh list (dict.fromkeys would alias a single
    shared list, which later appends would corrupt).
    """
    schema = {}
    for column in output.split(","):
        schema[column] = []
    return schema
|
319 |
+
def schema_generator(query):
    """Ask the LLM to turn the user's *query* into a csv-style schema dict.

    Returns {column_name: []} via schema_formatter.
    """
    # Formatting rules appended after the user's statement; fragments verbatim.
    rules = (
        "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n",
        "1. Only create the schema, no additional text or statement.\n",
        "2. Keep the schema simple, avoid complex column names.\n",
        "3. please only generate 5 schema if not mentioned.\n",
        "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n",
        "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n",
        "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n",
        "5. please only generate schema no notes or anything.\n",
    )
    response = consume_llm_api(query + "\n" + "".join(rules))
    return schema_formatter(response)
|
340 |
+
def sorting(data_dict):
    """Rank candidate row-dicts by how many of their fields are non-empty.

    *data_dict* is a list of dicts mapping column -> list of values.  Returns
    {str(row): filled_field_count} ordered best-first; equal scores fall back
    to reverse string order.  NOTE: duplicate rows share one key, so their
    counts accumulate.
    """
    scores = {str(row): 0 for row in data_dict}
    for row in data_dict:
        for column in row:
            if len(row[column]) != 0:
                scores[str(row)] += 1
    # Descending sort on (score, repr) pairs, then rebuild as an ordered dict.
    ranked = sorted(((count, text) for text, count in scores.items()), reverse=True)
    return {text: count for count, text in ranked}
|
351 |
+
|
352 |
+
|
353 |
+
def process_data(query):
    """Generator: end-to-end pipeline from a user prompt to verified data rows.

    Builds a schema via the LLM, fetches wikipedia context, extracts candidate
    rows, then yields each row after verification.  (Headless counterpart of
    the streamlit flow in app_stream.py; duplicates schema_generator's prompt.)
    """




    formatting = "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
    formatting += "1. Only create the schema, no additional text or statement.\n"
    formatting += "2. Keep the schema simple, avoid complex column names.\n"
    formatting+= "3. please only generate 5 schema if not mentioned.\n"
    formatting += "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
    formatting += "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
    formatting += "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
    formatting+= "5. please only generate schema no notes or anything.\n"
    print("Query:",query)
    output=consume_llm_api(query+"\n"+formatting)

    # Inline schema_formatter: one empty list per comma-separated column.
    schema = {i:[] for i in output.split(",")}
    textual_value=relevent_value(str(schema).lower(),3)
    html_pages = textual_value[1]
    textual_value = textual_value[0]
    # Materialise every extracted candidate row.
    data_dict =[j for j in actual_value(textual_value,schema)]
    # sorting() keys are str(row); best-filled rows come first.
    for j in sorting(data_dict):
        try:
            # Convert string to dictionary
            dummy_value = eval(j)

            # Process dictionary values
            for key in dummy_value:
                while len(dummy_value[key]) >= 2:
                    dummy_value[key].pop(0)

            # Format dictionary
            formatted = dictionary_formatting(dummy_value)
            print(formatted)
            # Verify and store result
            verification_result = verification(formatted) if formatted else None

            yield verification_result

        except Exception as e:
            print(f"Error processing dictionary {j}: {e}")
|
394 |
+
|
395 |
+
|
396 |
+
# for j in process_data("Generate data for smart phones"):
|
397 |
+
# print(j)
|
398 |
+
|
399 |
+
|