|
|
|
""" |
|
@Date : 2024/01/24 15:11:27 |
|
@Author : orange-crow |
|
@File : crawl_webpage.py |
|
""" |
|
|
|
from metagpt.roles.di.data_interpreter import DataInterpreter |
|
|
|
# Request prompt: scrape the ICLR 2024 `paperlist` table and save keyword-filtered
# rows (titles containing `multiagent` or `large language model`) to a CSV file.
# NOTE: fixed a typo — the original opened with four quotes (`""""`), which made the
# prompt text start with a stray `"` character.
PAPER_LIST_REQ = """

Get data from `paperlist` table in https://papercopilot.com/statistics/iclr-statistics/iclr-2024-statistics/,

and save it to a csv file. paper title must include `multiagent` or `large language model`. *notice: print key variables*

"""
|
|
|
# Request prompt: scrape first-page product data (name, price, product URL, image URL)
# from https://scrapeme.live/shop/ and save it to a CSV file. This is the request
# main() actually runs below.
ECOMMERCE_REQ = """

Get products data from website https://scrapeme.live/shop/ and save it as a csv file.

**Notice: Firstly parse the web page encoding and the text HTML structure;

The first page product name, price, product URL, and image URL must be saved in the csv;**

"""
|
|
|
# Request prompt (in Chinese — the target site is Chinese, so the prompt is too):
# scrape startup-financing flash news from https://pitchhub.36kr.com/financing-flash.
# Outline of the steps it asks for: 1) fetch and save the raw HTML locally; 2) print
# 2000 chars of HTML after the 7th "快讯" (flash-news) keyword as a structure sample;
# 3) derive a regex from that sample to extract title/link/time; 4) filter the last
# 3 days of startup financing items and print the first 5 as list[dict]; 5) save all
# results to a local CSV. The prompt text itself is runtime data and must stay as-is.
NEWS_36KR_REQ = """从36kr创投平台https://pitchhub.36kr.com/financing-flash 所有初创企业融资的信息, **注意: 这是一个中文网站**;

下面是一个大致流程, 你会根据每一步的运行结果对当前计划中的任务做出适当调整:

1. 爬取并本地保存html结构;

2. 直接打印第7个*`快讯`*关键词后2000个字符的html内容, 作为*快讯的html内容示例*;

3. 反思*快讯的html内容示例*中的规律, 设计正则匹配表达式来获取*`快讯`*的标题、链接、时间;

4. 筛选最近3天的初创企业融资*`快讯`*, 以list[dict]形式打印前5个。

5. 将全部结果存在本地csv中

"""
|
|
|
|
|
async def main():
    """Drive a DataInterpreter agent through the e-commerce scraping request.

    Builds an interpreter equipped with the Playwright scraping tool and awaits
    its run on ECOMMERCE_REQ; swap in PAPER_LIST_REQ or NEWS_36KR_REQ to try the
    other example prompts.
    """
    interpreter = DataInterpreter(tools=["scrape_web_playwright"])
    await interpreter.run(ECOMMERCE_REQ)
|
|
|
|
|
if __name__ == "__main__":

    # Entry point: run the async main() to completion on a fresh asyncio event loop.
    import asyncio



    asyncio.run(main())
|
|