from crewai import Agent, Task, Crew
from apify_client import ApifyClient
from langchain.tools import tool
from typing_extensions import Annotated

# Apify client authenticated with your API token
client = ApifyClient("Enter your Apify key here")

@tool("Web Scraper Tool")
def web_scraper_tool(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    """Web Scraper loads Start URLs in the browser and executes the Page function on each page to extract data from it."""
    run_input = {
        "runMode": "DEVELOPMENT",
        "startUrls": [{"url": url}],
        "linkSelector": "a[href]",
        "globs": [{"glob": "https://example.com/*"}],
        "pseudoUrls": [],
        "excludes": [{"glob": "/**/*.{png,jpg,jpeg,pdf}"}],
        "pageFunction": """// The function accepts a single argument: the "context" object.
// See https://apify.com/apify/web-scraper#page-function
async function pageFunction(context) {
    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
    // debugger;

    const $ = context.jQuery;
    const pageTitle = $('title').first().text();
    const h1 = $('h1').first().text();
    const first_h2 = $('h2').first().text();
    const random_text_from_the_page = $('p').first().text();

    // Print some information to the Actor log
    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

    // Manually add a new page to the queue for scraping.
    await context.enqueueRequest({ url: context.request.url });

    return {
        url: context.request.url,
        pageTitle,
        h1,
        first_h2,
        random_text_from_the_page
    };
}""",
        "proxyConfiguration": {"useApifyProxy": True},
        "initialCookies": [],
        "waitUntil": ["networkidle2"],
        "preNavigationHooks": """// We need to return array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "gotoOptions".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]""",
        "postNavigationHooks": """// We need to return array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]""",
        "breakpointLocation": "NONE",
    }

    # Run the Actor and wait for it to finish
    run = client.actor("apify/web-scraper").call(run_input=run_input)

    # Fetch Actor results from the run's dataset (if there are any)
    # print("Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += str(item) + "\n"
    return text_data

# Create the web scraper agent
web_scraper_agent = Agent(
    role='Web Scraper',
    goal='Effectively scrape data from websites for your company',
    backstory='''You are an expert web scraper. Your job is to scrape
    all the data for your company from a given website.''',
    tools=[web_scraper_tool],  # Agent expects a list of tools
    verbose=True
)

# Define the web scraper task
web_scraper_task = Task(
    description='Scrape all the URLs on the site so your company can use it for crawling and scraping.',
    expected_output='All the content of the website listed.',
    agent=web_scraper_agent,
    output_file='data.txt'
)

# Assemble the crew
crew = Crew(
    agents=[web_scraper_agent],
    tasks=[web_scraper_task],
    verbose=True,  # Recent CrewAI versions expect a boolean here, not an integer level
)

# Execute tasks
result = crew.kickoff()
print(result)

# Save the result to a file; kickoff() may return a CrewOutput object, so cast to str
with open('results.txt', 'w') as f:
    f.write(str(result))