Category: AutoGen

  • AutoGen Scraping

    ! pip install -qqq pyautogen apify-client
    
    import os
    import openai
    
    # LLM endpoints the agents may use. The API key is taken from the
    # OPENAI_API_KEY environment variable when it is set (and non-empty) so the
    # secret does not have to live in the source; otherwise the placeholder
    # below is used and must be replaced by hand.
    config_list = [
        {
            "model": "gpt-3.5-turbo",
            "api_key": os.environ.get("OPENAI_API_KEY") or "Enter your api key",
        },
    ]
    
    from apify_client import ApifyClient
    from typing_extensions import Annotated
    
    
    def scrape_page(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
        """Crawl ``url`` with an Apify Actor and return the extracted text.

        The Actor is started with ``url`` as its only start URL and the call
        blocks until the run has finished. The ``text`` field of every item in
        the run's default dataset is then concatenated, one item per line.

        Args:
            url: Address of the page to scrape.

        Returns:
            The concatenated text, cut to at most 15,000 characters
            (``int(0.75 * 20000)``). An empty string when the dataset has no
            items.
        """
        # Initialize the ApifyClient with your API token. A token supplied via
        # the APIFY_TOKEN environment variable takes precedence so the secret
        # need not be pasted into the source; the placeholder is the fallback.
        client = ApifyClient(token=os.environ.get("APIFY_TOKEN") or "Enter your apify key")
    
        # Prepare the Actor input
        run_input = {
            "startUrls": [{"url": url}],
            "useSitemaps": False,
            "crawlerType": "playwright:firefox",
            "includeUrlGlobs": [],
            "excludeUrlGlobs": [],
            "ignoreCanonicalUrl": False,
            "maxCrawlDepth": 0,
            "maxCrawlPages": 4,
            "initialConcurrency": 0,
            "maxConcurrency": 200,
            "initialCookies": [],
            "proxyConfiguration": {"useApifyProxy": True},
            "maxSessionRotations": 10,
            "maxRequestRetries": 5,
            "requestTimeoutSecs": 60,
            "dynamicContentWaitSecs": 10,
            "maxScrollHeightPixels": 5000,
            "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
        [role=\"alert\"],
        [role=\"banner\"],
        [role=\"dialog\"],
        [role=\"alertdialog\"],
        [role=\"region\"][aria-label*=\"skip\" i],
        [aria-modal=\"true\"]""",
            "removeCookieWarnings": True,
            "clickElementsCssSelector": '[aria-expanded="false"]',
            "htmlTransformer": "readableText",
            "readableTextCharThreshold": 100,
            "aggressivePrune": False,
            "debugMode": True,
            "debugLog": True,
            "saveHtml": True,
            "saveMarkdown": True,
            "saveFiles": False,
            "saveScreenshots": False,
            "maxResults": 9999999,
            "clientSideMinChangePercentage": 15,
            "renderingTypeDetectionPercentage": 10,
        }
    
        # Run the Actor and wait for it to finish
        run = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=run_input)
    
        # Collect the text of every item in the run's dataset (if there are
        # any). `or ""` also covers items whose "text" is present but None,
        # which would otherwise break the string concatenation.
        dataset_items = client.dataset(run["defaultDatasetId"]).iterate_items()
        text_data = "".join((item.get("text") or "") + "\n" for item in dataset_items)
    
        # Cap the size of the tool result so it stays well inside the model's
        # context window. The budget is derived from a token count but applied
        # to characters, which errs on the conservative side.
        average_token = 0.75
        max_tokens = 20000
        max_chars = int(average_token * max_tokens)
        return text_data[:max_chars]
    
    from autogen import ConversableAgent, register_function
    
    # Create the web scraper agent: the LLM-backed agent that is given the
    # scraping tool and asked to reply 'TERMINATE' once it is done.
    scraper_agent = ConversableAgent(
        "WebScraper",
        llm_config={"config_list": config_list},
        system_message="You are a web scrapper and you can scrape any web page using the tools provided. "
        "Returns 'TERMINATE' when the scraping is done.",
    )
    
    # Create the user proxy agent: no LLM, no human input, no code execution.
    # It executes the registered tool and ends the chat on a termination message.
    user_proxy_agent = ConversableAgent(
        "UserProxy",
        llm_config=False,  # No LLM for this agent.
        human_input_mode="NEVER",
        code_execution_config=False,  # No code execution for this agent.
        # A message whose "content" is missing or None is treated as empty text
        # (i.e. not a termination message) instead of raising on the lookup.
        is_termination_msg=lambda x: "terminate" in (x.get("content") or "").lower(),
        default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
    )
    
    # Register the function with the agents: the scraper agent is the caller
    # of the tool and the user proxy is its executor.
    register_function(
        scrape_page,
        caller=scraper_agent,
        executor=user_proxy_agent,
        name="scrape_page",
        description="Scrape a web page and return the content.",
    )
    
    # Start the conversation from the user proxy and request an LLM-written
    # summary of the chat in the fixed layout given by the summary prompt.
    chat_result = user_proxy_agent.initiate_chat(
        scraper_agent,
        message="Can you scrape https://example.com/ for me?",
        summary_method="reflection_with_llm",
        summary_args={
            "summary_prompt": """Summarize the scraped content and format summary EXACTLY as follows:
    ---
    *Website*:
    `https://example.com/`
    ---
    *content*:
    `[CONTENT GOES HERE]`
    ---
    """
        },
    )
    
    print(chat_result)