Category: crewai

  • Crewai Apify Tool

    from crewai import Agent, Task, Crew
    import os
    from apify_client import ApifyClient
    from langchain.tools import tool
    from typing_extensions import Annotated
    
    # Read the Apify API token from the environment instead of hard-coding it
    # (set APIFY_API_TOKEN before running)
    client = ApifyClient(os.environ["APIFY_API_TOKEN"])
    
    @tool("Web Scraper Tool")
    def web_scraper_tool(url: Annotated[str, "URL to scrape, e.g. https://example.com/"]) -> Annotated[str, "Scraped content"]:
        """Web Scraper loads Start URLs in the browser and executes Page function on each page to extract data from it."""
    
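        # Input for the apify/web-scraper Actor. DEVELOPMENT run mode enables
        # debugging features such as the `debugger;` breakpoint in pageFunction.
        # Note that "globs" scopes link-following to example.com; adjust it to
        # match the domain you pass in `url`.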
        run_input = {
            "runMode": "DEVELOPMENT",
            "startUrls": [{ "url": url }],
            "linkSelector": "a[href]",
            "globs": [{ "glob": "https://example.com/*" }],
            "pseudoUrls": [],
            "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
            "pageFunction": """// The function accepts a single argument: the "context" object.
            // see https://apify.com/apify/web-scraper#page-function
            async function pageFunction(context) {
                // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
                // debugger;
                const $ = context.jQuery;
                const pageTitle = $('title').first().text();
                const h1 = $('h1').first().text();
                const first_h2 = $('h2').first().text();
                const random_text_from_the_page = $('p').first().text();
                // Print some information to actor log
                context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);
    
                // Manually enqueue another page for scraping if needed. Re-enqueueing
                // the current URL is a no-op, since the request queue de-duplicates:
                // await context.enqueueRequest({ url: context.request.url });
    
                return {
                    url: context.request.url,
                    pageTitle,
                    h1,
                    first_h2,
                    random_text_from_the_page
                };
            }""",
            "proxyConfiguration": { "useApifyProxy": True },
            "initialCookies": [],
            "waitUntil": ["networkidle2"],
            "preNavigationHooks": """// We need to return array of (possibly async) functions here.
            // The functions accept two arguments: the "crawlingContext" object
            // and "gotoOptions".
            [
                async (crawlingContext, gotoOptions) => {
                    // ...
                },
            ]
            """,
            "postNavigationHooks": """// We need to return array of (possibly async) functions here.
            // The functions accept a single argument: the "crawlingContext" object.
            [
                async (crawlingContext) => {
                    // ...
                },
            ]""",
            "breakpointLocation": "NONE",
        }
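        # Note that pageFunction and the navigation hooks are passed as strings:
        # they execute inside the Actor's headless browser, not in this process.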
        # Run the Actor and wait for it to finish
        run = client.actor("apify/web-scraper").call(run_input=run_input)
    
        # Fetch the Actor's results from the run's dataset (if there are any) and
        # concatenate them into a single text blob for the agent.
        # print("Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
        text_data = ""
        for item in client.dataset(run["defaultDatasetId"]).iterate_items():
            text_data += str(item) + "\n"
        return text_data
    
    # Create the web scraper agent
    web_scraper_agent = Agent(
        role='Web Scraper',
        goal='Effectively scrape data from websites for your company',
        backstory='You are an expert web scraper. Your job is to scrape all the data for your company from a given website.',
        tools=[web_scraper_tool],  # Agent takes a list of tools via the `tools` keyword
        verbose=True
    )
    
    # Define the web scraper task
    web_scraper_task = Task(
        description='Scrape all the URLs on the site so your company can use it for crawling and scraping.',
        expected_output='All the content of the website listed.',
        agent=web_scraper_agent,
        output_file='data.txt'
    )
    
    # Assemble the crew
    crew = Crew(
        agents=[web_scraper_agent],
        tasks=[web_scraper_task],
        verbose=True,  # recent crewai versions expect a bool here
    )
    
    # Execute tasks
    result = crew.kickoff()
    print(result)
    
    # Save the result to a file (kickoff may return a CrewOutput, so coerce to str)
    with open('results.txt', 'w') as f:
        f.write(str(result))
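
    For a quick sanity check of the tool outside the crew, a langchain @tool can
    be called directly with .run() (a minimal sketch; run it in place of the
    crew.kickoff() call above, since each Actor run takes a while):

    # Standalone test of the scraper tool
    preview = web_scraper_tool.run("https://example.com/")
    print(preview[:500])  # first 500 characters of the scraped text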