! pip install -qqq pyautogen apify-client
import os
import openai
# LLM configuration handed to the LLM-backed agent via `llm_config`.
# The API key is read from the OPENAI_API_KEY environment variable so a real
# key never has to be written into this file; the placeholder literal is kept
# as the fallback when the variable is not set.
config_list = [
    {
        "model": "gpt-3.5-turbo",
        "api_key": os.environ.get("OPENAI_API_KEY", "Enter your api key"),
    },
]
from apify_client import ApifyClient
from typing_extensions import Annotated
def scrape_page(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    """Scrape a web page through an Apify Actor and return its text.

    Runs the Actor ``aYG0l9s7dbB7j3gbS`` with ``url`` as the only start URL,
    waits for the run to finish, then joins the ``text`` field of every item
    in the run's default dataset, one item per line. The joined text is cut
    to a fixed length before it is returned.

    The Apify API token is read from the ``APIFY_API_KEY`` environment
    variable; the placeholder literal is used when the variable is not set.

    Args:
        url: Address of the page the crawl starts from.

    Returns:
        The scraped text, at most 15000 characters long.
    """
    # Initialize the ApifyClient with the API token.
    client = ApifyClient(token=os.environ.get("APIFY_API_KEY", "Enter your apify key"))

    # Prepare the Actor input.
    run_input = {
        "startUrls": [{"url": url}],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        "excludeUrlGlobs": [],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 0,
        "maxCrawlPages": 4,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": {"useApifyProxy": True},
        "maxSessionRotations": 10,
        "maxRequestRetries": 5,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        # Written as adjacent literals with explicit newlines so the selector
        # text does not depend on how this file is indented.
        "removeElementsCssSelector": (
            "nav, footer, script, style, noscript, svg,\n"
            '[role="alert"],\n'
            '[role="banner"],\n'
            '[role="dialog"],\n'
            '[role="alertdialog"],\n'
            '[role="region"][aria-label*="skip" i],\n'
            '[aria-modal="true"]'
        ),
        "removeCookieWarnings": True,
        "clickElementsCssSelector": '[aria-expanded="false"]',
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": True,
        "debugLog": True,
        "saveHtml": True,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    # Run the Actor and wait for it to finish.
    # NOTE(review): the Actor ID is presumably Apify's Website Content
    # Crawler - confirm against the Apify console.
    run = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=run_input)

    # Collect the text of every dataset item, one item per line. An item
    # whose "text" is missing or None contributes an empty line instead of
    # raising a TypeError on concatenation.
    dataset = client.dataset(run["defaultDatasetId"])
    text_data = "".join(
        (item.get("text") or "") + "\n" for item in dataset.iterate_items()
    )

    # Cap the amount of text handed back to the model.
    # NOTE(review): the limit is derived from a token budget but applied to
    # characters, so the cut (15000 characters) is more conservative than the
    # names suggest - confirm this is intended.
    average_token = 0.75
    max_tokens = 20000  # slightly less than max to be safe 32k
    return text_data[: int(average_token * max_tokens)]
from autogen import ConversableAgent, register_function
# Create the web scraper agent: the LLM-backed participant of the chat.
scraper_llm_config = {"config_list": config_list}
scraper_system_message = (
    "You are a web scrapper and you can scrape any web page using the tools provided. "
    "Returns 'TERMINATE' when the scraping is done."
)
scraper_agent = ConversableAgent(
    name="WebScraper",
    system_message=scraper_system_message,
    llm_config=scraper_llm_config,
)
# Create the user proxy agent: no LLM, no human input, no code execution.
user_proxy_agent = ConversableAgent(
    "UserProxy",
    llm_config=False,  # No LLM for this agent.
    human_input_mode="NEVER",
    code_execution_config=False,  # No code execution for this agent.
    # A message may have no "content" key at all, or carry content=None.
    # Both are treated as empty text, so the check never raises KeyError and
    # only a message that actually contains "terminate" ends the chat.
    is_termination_msg=lambda x: "terminate" in (x.get("content") or "").lower(),
    default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
)
# Register scrape_page as a tool, with scraper_agent as the caller and
# user_proxy_agent as the executor.
scrape_tool_description = "Scrape a web page and return the content."
register_function(
    scrape_page,
    name="scrape_page",
    description=scrape_tool_description,
    executor=user_proxy_agent,
    caller=scraper_agent,
)
# Start the conversation from the user proxy and ask for an LLM-written
# summary in a fixed layout, then print the returned result object.
scrape_request = "Can you scrape https://example.com/ for me?"
summary_prompt = """Summarize the scraped content and format summary EXACTLY as follows:
---
*Website*:
`https://example.com/`
---
*content*:
`[CONTENT GOES HERE]`
---
"""
chat_result = user_proxy_agent.initiate_chat(
    scraper_agent,
    message=scrape_request,
    summary_method="reflection_with_llm",
    summary_args={"summary_prompt": summary_prompt},
)
print(chat_result)