CrewAI Apify Tool

import os

from apify_client import ApifyClient
from crewai import Agent, Task, Crew
from langchain.tools import tool
from typing_extensions import Annotated

client = ApifyClient("Enter your Apify key here")

@tool("Web Scraper Tool")
def web_scraper_tool(url: Annotated[str, "URL of the website to scrape, e.g. https://example.com/"]) -> Annotated[str, "Scraped content"]:
    """Web Scraper loads Start URLs in the browser and executes Page function on each page to extract data from it."""

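    # Input for the apify/web-scraper Actor; the pageFunction below runs in the
    # browser on each crawled page and returns the fields stored in the dataset.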
    run_input = {
        "runMode": "DEVELOPMENT",
        "startUrls": [{ "url": url }],
        "linkSelector": "a[href]",
        "globs": [{ "glob": "https://example.com/*" }],
        "pseudoUrls": [],
        "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
        "pageFunction": """// The function accepts a single argument: the "context" object.
        // see https://apify.com/apify/web-scraper#page-function
        async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
            // debugger;
            const $ = context.jQuery;
            const pageTitle = $('title').first().text();
            const h1 = $('h1').first().text();
            const first_h2 = $('h2').first().text();
            const random_text_from_the_page = $('p').first().text();
            // Print some information to actor log
            context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

            // Manually add a new page to the queue for scraping, if needed.
            // Re-enqueueing the current URL is deduplicated away by the request queue,
            // so this is left disabled here.
            // await context.enqueueRequest({ url: context.request.url });

            return {
                url: context.request.url,
                pageTitle,
                h1,
                first_h2,
                random_text_from_the_page
            };
        }""",
        "proxyConfiguration": { "useApifyProxy": True },
        "initialCookies": [],
        "waitUntil": ["networkidle2"],
        "preNavigationHooks": """// We need to return array of (possibly async) functions here.
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]
        """,
        "postNavigationHooks": """// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]""",
        "breakpointLocation": "NONE",
    }
    # Run the Actor and wait for it to finish
    run = client.actor("apify/web-scraper").call(run_input=run_input)

    # Fetch the Actor's results from the run's dataset (if there are any) and
    # concatenate them into a single string.
    # print("Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += str(item) + "\n"
    return text_data

# Create the web scraper agent
web_scraper_agent = Agent(
    role='Web Scraper',
    goal='Effectively scrape data from websites for your company',
    backstory='''You are an expert web scraper. Your job is to scrape all the data for
                your company from a given website.
                ''',
    tools=[web_scraper_tool],  # `tools` expects a list of tool instances
    verbose=True
)

# Define the web scraper task
web_scraper_task = Task(
    description='Scrape all the URLs and content of https://example.com/ so your company can use them.',
    expected_output='All the content of the website listed.',
    agent=web_scraper_agent,
    output_file='data.txt'
)

# Assemble the crew
crew = Crew(
    agents=[web_scraper_agent],
    tasks=[web_scraper_task],
    verbose=True,
)

# Execute tasks
result = crew.kickoff()
print(result)

# Save the result to a file (kickoff() may return a CrewOutput object, not a plain string)
with open('results.txt', 'w') as f:
    f.write(str(result))
