|
|
|
""" |
|
@Date : 2024/01/24 15:11:27 |
|
@Author : orange-crow |
|
@File : crawl_webpage.py |
|
""" |
|
|
|
from metagpt.roles.di.data_interpreter import DataInterpreter |
|
|
|
# Request prompt: scrape the ICLR 2024 `paperlist` table and save keyword-filtered
# rows (titles containing `multiagent` or `large language model`) to a CSV file.
# NOTE: fixed a typo — the original opened with four quotes (`""""`), which made the
# prompt text start with a stray `"` character.
PAPER_LIST_REQ = """

Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,

and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*

"""
|
|
|
# Request prompt: scrape first-page product data (name, price, product URL, image URL)
# from https://scrapeme.live/shop/ and save it to a CSV file. This is the request
# main() actually runs below.
ECOMMERCE_REQ = """

Get products data from website https://scrapeme.live/shop/ and save it as a csv file.

**Notice: Firstly parse the web page encoding and the text HTML structure;

The first page product name, price, product URL, and image URL must be saved in the csv;**

"""
|
|
|
# Request prompt (in Chinese — the target site is Chinese, so the prompt is too):
# scrape startup-financing flash news from https://pitchhub.36kr.com/financing-flash.
# Outline of the steps it asks for: 1) fetch and save the raw HTML locally; 2) print
# 2000 chars of HTML after the 7th "快讯" (flash-news) keyword as a structure sample;
# 3) derive a regex from that sample to extract title/link/time; 4) filter the last
# 3 days of startup financing items and print the first 5 as list[dict]; 5) save all
# results to a local CSV. The prompt text itself is runtime data and must stay as-is.
NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;

下面是一个大致流程, 你会根据每一步的运行结果对当前计划中的任务做出适当调整:

1. 爬取并本地保存html结构;

2. 直接打印第7个*`快讯`*关键词后2000个字符的html内容, 作为*快讯的html内容示例*;

3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;

4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。

5. 将全部结果存在本地csv中

"""
|
|
|
|
|
async def main():
    """Drive a DataInterpreter agent through the e-commerce scraping request.

    Builds an interpreter equipped with the Playwright scraping tool and awaits
    its run on ECOMMERCE_REQ; swap in PAPER_LIST_REQ or NEWS_36KR_REQ to try the
    other example prompts.
    """
    interpreter = DataInterpreter(tools=["scrape_web_playwright"])
    await interpreter.run(ECOMMERCE_REQ)
|
|
|
|
|
if __name__ == "__main__":

    # Entry point: run the async main() to completion on a fresh asyncio event loop.
    import asyncio



    asyncio.run(main())
|
|