warhawkmonk committed on
Commit
1294c62
·
verified ·
1 Parent(s): e57a5e2

Upload 2 files

Browse files
Files changed (2) hide show
  1. app_stream.py +187 -0
  2. data_collector.py +399 -0
app_stream.py ADDED
@@ -0,0 +1,187 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
import json
import os

import streamlit as st
# import wikipedia
from streamlit_lottie import st_lottie
import regex as re
from streamlit_js_eval import streamlit_js_eval
from common.utils import *
from data_collector import *
from langchain_community.llms import Ollama  # duplicate second import removed
import pandas as pd

st.set_page_config(layout="wide")

# Viewport size via a JS round-trip. streamlit_js_eval returns None on the
# very first script run (before the JS result arrives), so fall back to sane
# defaults to avoid a TypeError in the layout arithmetic below.
screen_width = streamlit_js_eval(label="screen.width", js_expressions='screen.width') or 1366
screen_height = streamlit_js_eval(label="screen.height", js_expressions='screen.height') or 768

# Streamlit session state doubles as the app's mutable store.
condition_capture = st.session_state
if 'schema' not in condition_capture:
    condition_capture['schema'] = {}          # column -> list of values
if 'prompt' not in condition_capture:
    condition_capture['prompt'] = ""          # last prompt the user submitted
if "count" not in condition_capture:
    condition_capture['count'] = 0            # requested number of rows
if "prev_schema" not in condition_capture:
    condition_capture['prev_schema'] = {}     # schema used on the previous run
if "textual_value" not in condition_capture:
    condition_capture['textual_value'] = {}   # cached (text, html) wikipedia pages
textual_value = None

schema = condition_capture['schema']

column1, column2 = st.columns(2)

with column2:
    # Fetch wikipedia pages once per schema and cache them in session state.
    if len(condition_capture['schema']) != 0 and len(condition_capture['textual_value']) == 0:
        condition_capture['textual_value'] = relevent_value(str(condition_capture['schema']).lower(), 50)
    if len(condition_capture['schema']) != 0:
        html_page = condition_capture['textual_value'][1]
        textual_value = condition_capture['textual_value'][0]
        st.write("<br>", unsafe_allow_html=True)
        with st.container(border=True, height=int(screen_height / 2.3)):
            st.header("Wikipedia insights")
            # Button read later from column1; column2 renders first, so the
            # name is always bound before it is used.
            updated_schema = st.button("Start processing")
            selector = st.empty()
            write = st.empty()
            start_page = selector.select_slider(
                "Select a range of color wavelength",
                options=[i for i in html_page],
                key="start_page",
            )
            write.write(html_page[start_page], unsafe_allow_html=True)

with column1:
    if str(schema) != str({}):
        tabs = st.tabs(["Schema", "Data Generation"])
        with tabs[0]:
            # The guard above already ensured a non-empty schema, so the
            # original's repeated inner check was dropped.
            schema_column1, schema_column2 = st.columns(2)
            with schema_column1:
                edited_df = st.data_editor(
                    [str(i) for index, i in enumerate(schema)],
                    hide_index=True,
                    use_container_width=True,
                    num_rows='dynamic',
                    height=int(screen_height / 3),
                )
            with schema_column2:
                number = st.number_input("Number of rows", min_value=1, max_value=1000, value=10)
                if number != condition_capture['count'] and updated_schema:
                    condition_capture['count'] = number
                # os.path.join keeps the asset path portable: the original
                # hard-coded Windows back-slashes, which are just part of the
                # filename on Linux hosts and make open() fail there.
                with open(os.path.join("animation", "edit_file.json")) as animate:
                    url_json = json.load(animate)
                st_lottie(url_json, height=int(screen_height / 3))
        with tabs[1]:
            with open(os.path.join("animation", "no data animation.json")) as animate:
                url_json = json.load(animate)
            dataframe = st.empty()
            if condition_capture['count'] == 0:
                st_lottie(url_json, height=int(screen_height / 3))
            else:
                smart_append = []
                if condition_capture['prev_schema'] != condition_capture['schema']:
                    condition_capture['prev_schema'] = condition_capture['schema']
                    condition_capture['current_append'] = {}
                # Stream candidate rows extracted from the wikipedia text.
                for text_indexing, store in enumerate(actual_value(textual_value, schema)):
                    dummy_value = dictionary_formatting(store)
                    # Keep only the most recent value per column.
                    for keys in dummy_value:
                        while len(dummy_value[keys]) >= 2:
                            dummy_value[keys].pop(0)
                    dummy_value = dictionary_formatting(dummy_value)
                    if dummy_value is not None:
                        smart_append.append(dummy_value)
                        print(dummy_value)
                        for keys in dummy_value:
                            if keys not in condition_capture['current_append']:
                                condition_capture['current_append'][str(keys)] = []
                            condition_capture['current_append'][str(keys)].append(str([i for i in dummy_value[keys]]))
                        dataframe.dataframe(condition_capture['current_append'])
                        # Stop once the requested row count is reached.
                        if len(condition_capture['current_append'][[i for i in condition_capture['current_append']][-1]]) >= condition_capture['count']:
                            break
                # NOTE(review): the table is wiped here and rebuilt from the
                # sorted candidates below; if smart_append is empty, the
                # rebuild branch reads the just-wiped table and would raise
                # IndexError — confirm the intended ordering with the author.
                condition_capture['current_append'] = {}
                if len(smart_append) == 0:
                    ranger = len(condition_capture['current_append'][[i for i in condition_capture['current_append']][0]])
                    for indexing in range(ranger):
                        working_dict = {}
                        for j in condition_capture['current_append']:
                            working_dict[j] = condition_capture['current_append'][j][indexing][0]
                        smart_append.append(working_dict)
                smart_movement = sorting(smart_append)
                # Rebuild the displayed table best-candidates-first.
                for keys in smart_movement:
                    value = eval(keys)  # keys are str(dict) produced by sorting()
                    for col in value:  # inner loop renamed; it shadowed `keys`
                        if col not in condition_capture['current_append']:
                            condition_capture['current_append'][str(col)] = []
                        condition_capture['current_append'][str(col)].append([str(i) for i in value[col]])
                    dataframe.dataframe(condition_capture['current_append'])
                # Best-effort enrichment pass: try to verify/complete each
                # candidate and patch its row in place.
                for indexing, j in enumerate(smart_movement):
                    try:
                        # Convert string to dictionary
                        dummy_value = eval(j)
                        # Process dictionary values
                        for key in dummy_value:
                            while len(dummy_value[key]) >= 2:
                                dummy_value[key].pop(0)
                        # Format dictionary
                        formatted = dictionary_formatting(dummy_value)
                        print(formatted)
                        # Verify and store result
                        verification_result = verification(formatted) if formatted else None
                        for col in verification_result:
                            if col in condition_capture['current_append']:
                                condition_capture['current_append'][col][indexing] = [str(i) for i in verification_result[col]]
                        dataframe.dataframe(condition_capture['current_append'])
                    except Exception:
                        # Was a bare except; narrowed so Ctrl-C still works.
                        # A failed candidate is simply skipped.
                        pass

    # The prompt box is always visible so the first schema can be created.
    prompt = st.text_input(label="Please use prompt to generate data", value=condition_capture['prompt'])
    if prompt != str(condition_capture['prompt']):
        condition_capture['prompt'] = prompt
        schema = schema_generator(prompt)
        condition_capture['schema'] = schema
        condition_capture['current_append'] = {}
        st.rerun()
data_collector.py ADDED
@@ -0,0 +1,399 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+
2
+ import wikipedia
3
+ import wikipediaapi
4
+ import regex as re
5
+ from sentence_transformers import SentenceTransformer,util
6
+ from transformers import pipeline
7
+ import requests
8
+
9
+ # def consume_llm_api(prompt):
10
+ # """
11
+ # Sends a prompt to the LLM API and processes the streamed response.
12
+ # """
13
+ # url = "https://3c93-70-167-32-130.ngrok-free.app/api/llm-response"
14
+ # headers = {"Content-Type": "application/json"}
15
+ # payload = {"prompt": prompt,"extension":"1"}
16
+
17
+
18
+ # print("Sending prompt to the LLM API...")
19
+ # response_ = requests.post(url, json=payload,verify=False)
20
+ # response_data = response_.json()
21
+ # return response_data['text']
22
def consume_llm_api(prompt):
    """Send *prompt* to the locally served llama3 model and return its reply.

    A fresh Ollama client is constructed per call; temperature 0.3 keeps
    the output mostly deterministic.
    """
    llm = Ollama(model="llama3:latest", temperature=0.3)
    return llm.invoke(prompt)
25
+
26
+
27
def relevent_value(long_query, count=3):
    """Search wikipedia for *long_query* and fetch the top *count* pages.

    Returns a pair of dicts keyed by page title:
    (plain-text extracts, HTML extracts).
    """
    titles = wikipedia.search(long_query, results=count)
    text_client = wikipediaapi.Wikipedia(
        user_agent='MyProjectName ([email protected])',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.WIKI,
    )
    html_client = wikipediaapi.Wikipedia(
        user_agent='MyProjectName ([email protected])',
        language='en',
        extract_format=wikipediaapi.ExtractFormat.HTML,
    )
    plain_pages = {}
    html_pages = {}
    for title in titles:
        plain_pages[title] = text_client.page(title).text
        html_pages[title] = html_client.page(title).text
    return plain_pages, html_pages
41
+
42
+
43
# NOTE(review): mid-file import — it executes at module import time, before
# any function below runs, so it works; conventionally it belongs at the top.
from langchain_community.llms import Ollama
# Module-level singletons, loaded once per process:
# the chat LLM used for generation and judging,
model=Ollama(model="llama3:latest",temperature=0.3)
# a sentence embedder used to route agent commands by cosine similarity,
agent_understanding = SentenceTransformer('sentence-transformers/all-MiniLM-L6-v2')
# and an extractive QA model for pulling arguments out of agent commands.
qa_model = pipeline('question-answering', model='deepset/roberta-base-squad2', tokenizer='deepset/roberta-base-squad2')
47
+
48
+ # textual_value
49
+
50
def construction_edit(textual_value, schema):
    """Ask the LLM to coerce *textual_value* into a dict matching *schema*.

    Builds a rule-list prompt around the generated text and returns the raw
    LLM reply (expected, but not guaranteed, to contain a dict literal).
    """
    parts = [
        textual_value,
        "Above is the generated text from wikipedia and below is the rule that has to be filled in the data. ",
        "The data should be in the form of a dictionary and it must follow the following schema: ",
        str(schema),
        "The length of each list of each key must be same in the generated data(mandatory).",
        "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' .",
        "The output must be a dictionary",
    ]
    prompt = "\n".join(parts) + "\n"
    return consume_llm_api(prompt)
60
+
61
def dictionary_check(data_dict):
    """Return True when every key in *data_dict* maps to a non-empty value.

    An empty dict vacuously passes. The parameter was renamed: the original
    called it ``construction_edit``, shadowing the module-level function of
    the same name.
    """
    for key in data_dict:
        if len(data_dict[key]) == 0:
            return False
    return True
66
+
67
def actual_value(textual_value, schema):
    """Yield one schema-shaped dict per wikipedia article in *textual_value*.

    For each article, prompt the LLM to fill *schema*, post-process the reply
    through construction_edit(), parse the first balanced {...} from it, and
    normalise: missing schema keys become empty lists, values are de-duplicated
    and the 'Na' placeholder is stripped. Articles whose reply cannot be
    parsed are skipped silently.
    """
    for j in textual_value:
        formatted_result = str(textual_value[j]) + "\n"
        formatted_result += "Please fill the following schema with the relevant data from the text above." + "\n"
        formatted_result += "Here is the schema" + "\n"
        formatted_result += str(schema)
        formatted_result += "Please generate data according to schema and fill this template with your answers.\n"
        formatted_result += "You have to fill each key with the relevant data from the text above." + "\n"
        formatted_result += "Please return the exact key value pair as the schema above. " + "\n"
        formatted_result += "No helper text like 'Here is the filled-in JSON schema based on the provided text' or 'Note: I've filled in the keys with relevant data' ." + "\n"
        formatted_result += "Only fill the keys that are in the schema." + "\n"
        formatted_result += "If you are not sure about the data, you can add 'Na'." + "\n"
        formatted_result += "It's an order you can not add any other text(e.g Here is the filled-in JSON schema) or note ." + "\n"
        formatted_result += "The length of each list of each key must be same in the generated data(mandatory)." + "\n"
        raw_output = consume_llm_api(formatted_result)
        try:
            data = construction_edit(raw_output, schema)
            # Recursive regex ((?R) needs the third-party `regex` module,
            # imported as `re`): grabs the first balanced {...} object.
            json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', data)
            # SECURITY: eval() on LLM output can execute arbitrary code;
            # ast.literal_eval would be safer if the replies allow it.
            access_value = eval(json_object_match.group())
            for schema_key in schema:
                if schema_key not in access_value:
                    access_value[schema_key] = list(set())
            for schema_key in access_value:
                access_value[schema_key] = list(set(access_value[schema_key]))
                access_value[schema_key] = list(set(access_value[schema_key]) - set(["Na"]))
            yield access_value
        except Exception:
            # Was a bare `except` with a dead `access_value = None` store;
            # narrowed so KeyboardInterrupt still propagates. A malformed
            # reply for one article simply skips that article.
            continue
96
+
97
+
98
+
99
+
100
def context_data_relevancy(value, context):
    """Ask the LLM whether *context* can plausibly fill gaps in *value*.

    Returns the raw reply; callers look for the '@yahoo@' marker to decide.
    (Prompt text, including its original typos, is preserved verbatim —
    it is runtime data the model sees.)
    """
    prompt = (
        "You are a professional reasearcher from data ." + "\n"
        + "You have to check can we fill some of the missing values in the " + str(value) + ". \n"
        + "The possible part which available in the context has to be relevent with already present data" + ". \n"
        + "from the context given below" + ". \n"
        + context + "\n"
        + "Be strict while thing of filling data" + ". \n"
        + "Just return @yahoo@ if 90% possible else @NO@" + ". \n"
    )
    return consume_llm_api(prompt)
112
+
113
def agent_work_result(query, value):
    """Execute one agent command *query* against the shared data dict *value*.

    The command is routed by embedding similarity to one of three actions:
    1. extract — QA-extract a search term, pull wikipedia pages, and return
       the first page the LLM judges relevant ('@yahoo@' marker);
    2. append — QA-extract a (value, key) pair and append it if the key's
       list is still empty;
    3. check — report whether all key lists have equal length.
    Returns a status/result string consumed by agent_data_prep().
    """
    # Reuse the module-level sentence transformer; the original re-loaded
    # the model on every call.
    query_embedding = agent_understanding.encode(query)
    score1 = util.cos_sim(query_embedding, agent_understanding.encode("extract data for"))
    score2 = util.cos_sim(query_embedding, agent_understanding.encode("append data in "))
    score3 = util.cos_sim(query_embedding, agent_understanding.encode("check data"))

    if score1 > score2 and score1 > score3:
        question = "search word ?"
        result = qa_model(question=question, context=query)
        result = result['answer']
        print("Extracting query:", result)
        # Only the plain-text pages are needed; the HTML variant (second
        # tuple element) was assigned to an unused local before.
        wikisearch = relevent_value(result, 3)[0]
        for searches in wikisearch:
            if "@yahoo@" in context_data_relevancy(value, wikisearch[searches]):
                return wikisearch[searches]
        return "No data found"
    elif score2 > score1 and score2 > score3:
        try:
            print("Appending command:", query)
            question1 = "which value we are adding to key ?"
            result1 = qa_model(question=question1, context=query)
            question2 = "In which key we are appending ?"
            result2 = qa_model(question=question2, context=query)
            result1 = result1['answer']
            result2 = result2['answer']
            if len(value[result2]) == 0:
                value[result2].append(result1)
                return "Now you can fill the remaining columns"
            else:
                return "You are putting value in the same key column again not accepted."
        except Exception as e:
            return str(e)
    else:
        # BUG FIX: the original initialised ``min_ = 0``, so it never tracked
        # the true minimum and the success branch fired only when every list
        # was empty — the opposite of the intended "all lengths equal" check.
        lengths = [len(value[keys]) for keys in value]
        if not lengths or min(lengths) == max(lengths):
            # Sentinel kept verbatim (typo included): agent_data_prep()
            # compares against this exact string to terminate its loop.
            return "You dia a great job"
        else:
            return "Please append the data correctly so that the length of each key is same and data is also relevant"
164
+
165
+
166
def full_alignment(value):
    """Return True when every key in *value* holds at least one entry.

    An empty dict vacuously counts as aligned.
    """
    return all(len(entries) != 0 for entries in value.values())
171
+
172
def query_formatting(result):
    """Split an LLM reply into lines, dropping the first line.

    str.split always yields at least one element, so the first line
    (typically a header the model insists on adding) is always removed.
    """
    lines = result.split("\n")
    return lines[1:]
177
def missing_value_completion(store, value):
    """Fill empty lists in *value* from the article texts in *store*.

    For each article, prompts the LLM for the missing fields, parses the
    first balanced {...} from the reply, and appends one entry to every
    still-empty key found in it. *value* is mutated in place.

    Returns *value* as soon as every key is filled; returns None implicitly
    if the articles are exhausted first — callers re-check full_alignment().
    """
    filler_prompt = "Below is mentioned ajson data\n"
    filler_prompt += str(value) + "\n"
    filler_prompt += "you only need to find missing data from the mentioned context section."
    filler_prompt += "You will return the results in below mentioned format.\n"
    filler_prompt += "The output will be in json format."
    filler_prompt += "context:\n"

    for search_key in store:
        try:
            fill_text = store[search_key]
            response = consume_llm_api(filler_prompt + fill_text)
            # Recursive regex ((?R), third-party `regex` module imported as
            # `re`): first balanced {...} object in the reply.
            json_object_match = re.search(r'\{(?:[^{}]|(?R))*\}', response)
            # SECURITY: eval() on LLM output can execute arbitrary code.
            access_value = eval(json_object_match.group())
            for keys in value:
                if len(value[keys]) == 0 and keys in access_value:
                    value[keys].append(access_value[keys].pop(0))
            print(value)
            if full_alignment(value):
                return value
        except Exception:
            # Was a bare `except: pass`; narrowed so KeyboardInterrupt still
            # propagates. A bad reply just moves on to the next article.
            continue
201
+
202
+
203
+
204
+
205
def verification(value):
    """Try to fill the empty lists in *value* via LLM-generated web searches.

    Asks the LLM for a list of search queries targeting the missing fields,
    fetches wikipedia pages for each query, and delegates the actual filling
    to missing_value_completion().

    Returns the completed dict as soon as every key is filled; otherwise
    falls through and returns the raw *query list* — callers must be
    prepared for either shape.
    """
    validation_prompt = "Can you prepare a list of text(many as possible) that can be searched on google for filling(relevent data) the missing data below.\n"
    validation_prompt += str(value) + "\n"
    validation_prompt += "You need to prepare it by the following manner"
    validation_prompt += "1. Mention it line by line.\n"
    validation_prompt += "2. Please seperate it line by line.\n"
    validation_prompt += "3. Headers are not required\n"
    validation_prompt += "4. Please do not add any helper text example: Here is the required search queries , Here are the search queries .\n"
    validation_prompt += "5. Please do not add any notes"
    print("Searching for missing values")
    result = query_formatting(consume_llm_api(validation_prompt))

    for search_queries in result:
        if len(search_queries) != 0:
            print(search_queries)
            # Only the plain-text pages are needed; the HTML half of the
            # tuple was assigned to an unused local in the original.
            store = relevent_value(search_queries)[0]
            missing_value_completion(store, value)
            if full_alignment(value):
                return value

    # NOTE: fallback return is the list of search queries, not the dict.
    return result
234
+
235
def agent_data_prep(value,query):
    """Drive an LLM 'agent' loop that fills the empty lists in *value*.

    Each iteration builds a long instruction prompt (including a mock
    reward ledger of +/- $1000 per accepted/rejected append), asks llama3
    for one command, and executes it via agent_work_result(), until either
    every column holds a value or the checker returns its success sentinel.

    NOTE(review): loop termination relies on the EXACT sentinel strings
    returned by agent_work_result() — including the typo
    'You dia a great job' — so those strings must not be edited in
    isolation. Prompt text below is runtime data and is left verbatim,
    typos included.
    """
    end_result = ""
    # Income strings are kept as str and converted for arithmetic — this is
    # how the original tracked the mock reward; preserved as-is.
    angent_earlier_income ="0"
    pre_money_saving = "0"
    mission = "First to fill most importent column \n"
    while end_result!="You dia a great job":
        # Stop early once every key already holds a value.
        if full_alignment(value):
            return value
        agent_instruction = mission
        agent_instruction += "your previous income"+pre_money_saving+"\n"
        agent_instruction += "your current income"+angent_earlier_income+"\n"
        pre_money_saving = angent_earlier_income
        # Penalise the agent when it tried to refill an already-filled key.
        if end_result=="You are putting value in the same key column again not accepted.":
            mission = "Why you are always filling the"+[i for i in value][-1]+"only.\n"
            mission += "We are removing $1000 from you account \n"
            angent_earlier_income = str(int(angent_earlier_income)-1000)
        agent_instruction += end_result + "\n" +"Above is the result of your previous command. Please give the next command to the agent."
        agent_instruction += query + "\n"
        agent_instruction += "Below is the data gathered upto now" + "\n"
        agent_instruction += str(value) + "\n"
        agent_instruction += "Please utilize the tool where you can command the agent to do any of the following tasks(one instruction at a time )"+ "\n"
        agent_instruction += "You only have to fill one value for each key if its not present. \n"
        agent_instruction += "From now onwards your each statement is understand as command which is categoried in any of the commands in mentioned below examples. \n"
        agent_instruction += "1. Ask agent to extract data from the web about anything like search for lamp production ,smartphone parts etc .\n"
        agent_instruction += "2. Give any specific value to append in current generated data . Please also mention the key in which the agent has to append the data .\n"
        agent_instruction += "3. Ask the agent to put the generated data on check weather each column fills correctly or not .\n"
        agent_instruction += "Here is the instruction to give commands to the agent. \n"
        agent_instruction += "You can give commands to the agent ,few examples are mentioned below. \n"
        agent_instruction += "1. Extract data about iron man suit or iron man suit mark1 \n"
        agent_instruction += "(while thinking about extract data look into the data \n"
        agent_instruction += "where data can be append and then search relevent query \n"
        agent_instruction += "like green arrow from DC only if DC and green arraow is in different column key values )\n\n"
        agent_instruction += "2. Append value 'bmw 4' to Car Model key \n"
        agent_instruction += "(While appending the value you must have read the data from extract data command and remember, if you found anything relevent don't forget to append.\n"
        agent_instruction += "The appending value has to be different not already present.) \n\n"
        agent_instruction += "Any different grammatical version of the above commands. \n"
        agent_instruction += "Command has to be given only for 'data filling' purpose. \n"
        agent_instruction += "While command like search for or extract information about something it has to be relevent query search. \n"
        agent_instruction += "The relevent the query the more accurate the data will be. \n"
        agent_instruction += "Be cautious while filling the data It has to be correct. \n"
        agent_instruction += "For each correct append you will get $1000. \n"
        agent_instruction += "Give your command only no text . \n"
        agent_instruction += "There will an audit after filling all the columns on data for its validity. \n"
        agent_instruction += "Some mistakes are okay but But if we find you guilty there are some repercussion."
        # instruction to give commands to the agent
        # NOTE(review): a fresh Ollama client per iteration — the
        # module-level `model` could likely be reused; confirm before changing.
        judgement = Ollama(model = "llama3:latest")
        command = judgement.invoke(agent_instruction)
        end_result = agent_work_result(command,value)
        # Reward an accepted append.
        if "Now you can fill the remaining columns" in end_result:
            angent_earlier_income = str(int(angent_earlier_income)+1000)
        print("--------------------")
        print(value)
        print("--------------------")
    return value
302
+
303
def dictionary_formatting(value):
    """Return a copy of *value* with surrounding whitespace stripped from keys.

    Keys are moved out of *value* with pop(), so the argument is emptied of
    every renamed key as a side effect. When a stripped key collides with an
    earlier result the later entry wins (same as the original).

    The original also pre-seeded ``new_dict[key_values] = []`` before
    immediately overwriting it; that dead store has been removed.
    """
    new_dict = {}
    # Iterate over a snapshot of the keys: the dict is mutated via pop().
    for data_keys in [i for i in value]:
        key_values = data_keys.strip()
        if key_values in value:
            # A key equal to the stripped form already exists — take it.
            new_dict[key_values] = value.pop(key_values)
        else:
            new_dict[key_values] = value.pop(data_keys)
    return new_dict
314
+
315
+
316
def schema_formatter(output):
    """Turn a comma-separated column list into an empty schema dict.

    Each comma-separated token (whitespace NOT stripped) becomes a key
    mapped to a fresh empty list; duplicate tokens collapse to one key.
    """
    columns = output.split(",")
    return {column: [] for column in columns}
319
def schema_generator(query):
    """Ask the LLM for a CSV-style column list matching *query* and return
    it as an empty schema dict (column -> []).

    The instruction text below is runtime data sent to the model and is
    reproduced verbatim.
    """
    instructions = (
        "The above statement is given by the user. Please create a single .csv-based schema by following the points below:\n"
        "1. Only create the schema, no additional text or statement.\n"
        "2. Keep the schema simple, avoid complex column names.\n"
        "3. please only generate 5 schema if not mentioned.\n"
        "4. For example, if the user provides a statement like: 'Generate data for students getting placements from IIT Bombay,' the response should be:\n"
        "Student Name, Student Roll Number, Student Branch, Student Year, Student Placement Status, Student Company Name, Student Package, Student Location, Student Role\n"
        "Follow the above example but remember above is not actual schema you have to provide the schema depending on the user prompt.\n"
        "5. please only generate schema no notes or anything.\n"
    )
    llm_reply = consume_llm_api(query + "\n" + instructions)
    return schema_formatter(llm_reply)
340
def sorting(data_dict):
    """Rank candidate dicts by how many of their keys are actually filled.

    Takes a list of dicts, counts the non-empty value lists in each, and
    returns {str(candidate): count} ordered best-first. Keys are the dicts'
    string representations — callers eval() them back into dicts.
    """
    counts = {str(candidate): 0 for candidate in data_dict}
    for candidate in data_dict:
        for field in candidate:
            if len(candidate[field]) != 0:
                counts[str(candidate)] += 1
    # Sort (count, repr) pairs descending, then rebuild an ordered mapping.
    ranked = sorted(((count, key) for key, count in counts.items()), reverse=True)
    return {key: count for count, key in ranked}
351
+
352
+
353
def process_data(query):
    """End-to-end pipeline: user prompt -> schema -> wikipedia -> filled rows.

    Generates a schema for *query*, pulls wikipedia text for its columns,
    extracts candidate rows, and yields one verification() result per
    candidate, best-filled candidates first.
    """
    print("Query:", query)
    # The original duplicated schema_generator()'s entire prompt inline,
    # byte for byte; reuse the function so there is a single copy to maintain.
    schema = schema_generator(query)
    # Only the plain-text pages are needed here; the HTML half of the tuple
    # was assigned to an unused local in the original.
    textual_value = relevent_value(str(schema).lower(), 3)[0]
    data_dict = [j for j in actual_value(textual_value, schema)]
    for j in sorting(data_dict):
        try:
            # sorting() keys are str(dict); eval them back into dicts.
            dummy_value = eval(j)
            # Keep only the most recent entry per key.
            for key in dummy_value:
                while len(dummy_value[key]) >= 2:
                    dummy_value[key].pop(0)
            # Normalise keys (strip whitespace).
            formatted = dictionary_formatting(dummy_value)
            print(formatted)
            # Try to complete any still-empty columns before yielding.
            verification_result = verification(formatted) if formatted else None
            yield verification_result
        except Exception as e:
            print(f"Error processing dictionary {j}: {e}")
394
+
395
+
396
+ # for j in process_data("Generate data for smart phones"):
397
+ # print(j)
398
+
399
+