Maouu committed f85ff86 (1 parent: 328de20)

added a scrape route

__pycache__/app.cpython-312.pyc CHANGED
Binary files a/__pycache__/app.cpython-312.pyc and b/__pycache__/app.cpython-312.pyc differ
 
app.py CHANGED
@@ -16,6 +16,7 @@ from pathlib import Path
 from collections import Counter, defaultdict
 from utils.logger import log_request
 from chipsearch.main import search
+from scrape.main import scrape_to_markdown

 app = FastAPI()

@@ -424,3 +425,16 @@ async def chipsearch(request: Request):
         unique=bool(request.query_params.get("unique", False))
     )
     return data
+
+
+@app.post("/scrape-md")
+async def scrape_md(request: Request):
+    data = await request.json()
+    url = data.get("url")
+    if not url:
+        return {"error": "URL is required"}
+
+    data = scrape_to_markdown(url)
+
+    return {"markdown": data}
+
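For quick verification, the new route can be exercised with a small client script. A minimal sketch, assuming the app is running locally at http://127.0.0.1:8000 (the base URL the old test.py pointed at) and using example.com as a placeholder page:

import requests

# Hypothetical smoke test for the new POST /scrape-md route.
# Assumes the FastAPI app is being served at http://127.0.0.1:8000.
resp = requests.post(
    "http://127.0.0.1:8000/scrape-md",
    json={"url": "https://example.com"},
)
resp.raise_for_status()
print(resp.json()["markdown"][:300])  # preview the converted markdown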
logs.json CHANGED
@@ -1,317 +0,0 @@
-[
-  {
-    "timestamp": "2025-04-12T11:48:38.183002",
-    "endpoint": "/chat",
-    "query": "vercelXaigenerate"
-  },
-  ... 61 further near-identical "/chat" entries (queries: "generate", "groqgenerate", "vercelXaigenerate", "vercelGroqgenerate") ...
-  {
-    "timestamp": "2025-04-12T12:23:41.708982",
-    "endpoint": "/chat",
-    "query": "vercelGroqgenerate"
-  }
-]
requirements.txt CHANGED
@@ -9,4 +9,5 @@ jinja2
 aiofiles
 matplotlib
 curl_cffi
-beautifulsoup4
+beautifulsoup4
+html2text
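The two new entries back the scrape module: beautifulsoup4 parses and prunes the fetched HTML, and html2text performs the HTML-to-markdown conversion. Both install with pip install beautifulsoup4 html2text.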
scrape/__pycache__/main.cpython-312.pyc ADDED
Binary file (1.03 kB).
 
scrape/main.py ADDED
@@ -0,0 +1,29 @@
+from curl_cffi import requests as req
+from bs4 import BeautifulSoup
+import html2text
+
+def scrape_to_markdown(url):
+    """
+    Scrapes a webpage and converts its content to markdown format.
+
+    Args:
+        url (str): The URL of the webpage to scrape
+
+    Returns:
+        str: The webpage content converted to markdown
+    """
+    # Fetch HTML content
+    response = req.get(url, impersonate='chrome110')
+    soup = BeautifulSoup(response.text, 'html.parser')
+
+    # Clean up unwanted tags ('css' is not an HTML tag, so it matches nothing)
+    for tag in soup(['script', 'style', 'noscript', 'svg', 'css']):
+        tag.decompose()
+
+    # Extract cleaned HTML
+    clean_html = str(soup)
+
+    # Convert to Markdown
+    markdown = html2text.html2text(clean_html)
+
+    return markdown
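The helper can also be sanity-checked directly, without going through the HTTP layer. A minimal sketch, assuming the package root is on the import path and using example.com as a stand-in URL:

from scrape.main import scrape_to_markdown

# Placeholder URL; any publicly reachable page should work.
md = scrape_to_markdown("https://example.com")
print(md[:300])  # preview the start of the converted markdown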
test.py CHANGED
@@ -1,50 +1,22 @@
-import requests
-import json
-
-# import requests
-# import json
-
-# url = "http://127.0.0.1:8000/chat"
-
-# payload = {
-#     "model": "your-model-name",
-#     "message": "Create a json object of top 10 anime! answer in json only!",
-#     "messages": []
-# }
-
-# headers = {
-#     "Content-Type": "application/json"
-# }
-
-# # Send streaming POST request
-# with requests.post(url, data=json.dumps(payload), headers=headers, stream=True) as response:
-#     if response.status_code == 200:
-#         for line in response.iter_lines(decode_unicode=True):
-#             if line and line.startswith('data: '):
-#                 try:
-#                     # Remove 'data: ' prefix and parse JSON
-#                     json_data = json.loads(line[6:])
-#                     # Extract text from choices if available
-#                     if json_data.get('choices') and len(json_data['choices']) > 0:
-#                         text = json_data['choices'][0].get('text', '')
-#                         if text:
-#                             print(text, end='')
-#                 except json.JSONDecodeError:
-#                     continue
-#             else:
-#                 print("Error:", response.status_code, response.text)
-
-
-def search_chips(term, num_results=10, advanced=False, unique=False):
-    url = f"http://127.0.0.1:8000/chipsearch?term={term}&num_results={num_results}&advanced={advanced}&unique={unique}"
-
-    try:
-        response = requests.post(url)
-        response.raise_for_status()
-        return response.json()
-    except requests.exceptions.RequestException as e:
-        print(f"Error: {e}")
-        return None
-
-results = search_chips("top 10 anime of all time")
-print(results)
+from curl_cffi import requests as req
+from bs4 import BeautifulSoup
+import html2text
+
+url = 'https://www.firecrawl.dev/'
+
+# Fetch HTML content
+response = req.get(url)
+soup = BeautifulSoup(response.text, 'html.parser')
+
+# Optional: Clean up unwanted tags
+for tag in soup(['script', 'style', 'noscript', 'svg']):
+    tag.decompose()
+
+# Extract cleaned HTML
+clean_html = str(soup)
+
+# Convert to Markdown
+markdown = html2text.html2text(clean_html)
+
+# Output
+print(markdown)