Wikipedia Tool

from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.pydantic_v1 import BaseModel, Field


class WikiInputs(BaseModel):
    """Inputs to the Wikipedia tool."""

    query: str = Field(
        description="query to look up in Wikipedia, should be 3 or less words"
    )


# Wikipedia's public API needs no key; the wrapper takes tuning options instead.
api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=1000)

tool = WikipediaQueryRun(
    name="wiki-tool",
    description="look up things in wikipedia",
    args_schema=WikiInputs,
    api_wrapper=api_wrapper,
    return_direct=True,
)
print(tool.run("Who is Dwayne Johnson?"))
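Because the tool is declared with return_direct=True, an agent that calls it hands the Wikipedia result straight back to the user instead of reasoning over it further. A minimal sketch of that wiring, assuming the classic initialize_agent API and an OpenAI key in the environment (the model name is illustrative):

from langchain.agents import AgentType, initialize_agent
from langchain_openai import ChatOpenAI

llm = ChatOpenAI(model="gpt-3.5-turbo", temperature=0)

# The agent picks wiki-tool from its description; return_direct=True makes
# the tool's output the final answer rather than another reasoning step.
agent = initialize_agent(
    tools=[tool],
    llm=llm,
    agent=AgentType.ZERO_SHOT_REACT_DESCRIPTION,
    verbose=True,
)
agent.run("Who is Dwayne Johnson?")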
-
Exa Tools
from exa_py import Exa


class ExaSearch:
    name: str = "ExaSearch"
    description: str = "Perform a search with Exa and return search results with URLs"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, query: str):
        results = self.exa.search_and_contents(
            query,
            text={"include_html_tags": True, "max_characters": 1000},
        )
        return results


class ExaSimilar:
    name: str = "ExaSimilar"
    description: str = "Search for webpages similar to a given URL"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, url: str):
        """Search for webpages similar to a given URL.

        The url passed in should be a URL returned from `search`.
        """
        results = self.exa.find_similar(url, num_results=3)
        return results


class ExaContents:
    name: str = "ExaContents"
    description: str = "Get the contents of webpages from a list of ids"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, ids: list):
        """Get the contents of a webpage.

        The ids must be passed in as a list, a list of ids returned from `search`.
        """
        contents = self.exa.get_contents(ids)
        contents_str = str(contents)
        split_contents = contents_str.split("URL:")
        trimmed_contents = [content[:1000] for content in split_contents]
        return "\n\n".join(trimmed_contents)


# Example usage
if __name__ == "__main__":
    api_key = "Enter your Exa key here"

    # Search
    search_tool = ExaSearch(api_key=api_key)
    search_results = search_tool.run("latest AI News")
    print("Search Results:", search_results)

    # Find similar webpages; each tool is its own class, so instantiate it
    similar_tool = ExaSimilar(api_key=api_key)
    similar_results = similar_tool.run("https://boomi.com/")  # Valid URL
    print("Similar Results:", similar_results)

    # Get contents using ids returned from a search
    contents_tool = ExaContents(api_key=api_key)
    contents = contents_tool.run(["tesla.com"])  # Replace with actual ids
    print("Contents:", contents)
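The snippet originally imported BaseTool from praisonai_tools without using it, which suggests these classes were meant to plug into a PraisonAI/CrewAI agent. A minimal sketch of that adaptation, assuming praisonai_tools.BaseTool follows the CrewAI convention of a pydantic model with name and description fields and a _run method (the EXA_API_KEY environment variable is an assumption of this sketch):

import os

from exa_py import Exa
from praisonai_tools import BaseTool


class ExaSearchBaseTool(BaseTool):
    name: str = "ExaSearch"
    description: str = "Perform a search with Exa and return search results with URLs"

    def _run(self, query: str):
        # Assumes EXA_API_KEY is exported before the agent runs.
        exa = Exa(api_key=os.environ["EXA_API_KEY"])
        return exa.search_and_contents(
            query, text={"include_html_tags": True, "max_characters": 1000}
        )
-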
Crewai Apify Tool
from crewai import Agent, Task, Crew
from apify_client import ApifyClient
from langchain.tools import tool
from typing_extensions import Annotated

client = ApifyClient("Enter your Apify key here")


@tool("Web Scraper Tool")
def web_scraper_tool(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    """Web Scraper loads Start URLs in the browser and executes Page function on each page to extract data from it."""
    run_input = {
        "runMode": "DEVELOPMENT",
        "startUrls": [{"url": url}],
        "linkSelector": "a[href]",
        "globs": [{"glob": "https://example.com/*"}],
        "pseudoUrls": [],
        "excludes": [{"glob": "/**/*.{png,jpg,jpeg,pdf}"}],
        "pageFunction": """// The function accepts a single argument: the "context" object.
// see https://apify.com/apify/web-scraper#page-function
async function pageFunction(context) {
    // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
    // debugger;
    const $ = context.jQuery;
    const pageTitle = $('title').first().text();
    const h1 = $('h1').first().text();
    const first_h2 = $('h2').first().text();
    const random_text_from_the_page = $('p').first().text();

    // Print some information to actor log
    context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

    // Manually add a new page to the queue for scraping.
    await context.enqueueRequest({ url: context.request.url });

    return {
        url: context.request.url,
        pageTitle,
        h1,
        first_h2,
        random_text_from_the_page
    };
}""",
        "proxyConfiguration": {"useApifyProxy": True},
        "initialCookies": [],
        "waitUntil": ["networkidle2"],
        "preNavigationHooks": """// We need to return array of (possibly async) functions here.
// The functions accept two arguments: the "crawlingContext" object
// and "gotoOptions".
[
    async (crawlingContext, gotoOptions) => {
        // ...
    },
]
""",
        "postNavigationHooks": """// We need to return array of (possibly async) functions here.
// The functions accept a single argument: the "crawlingContext" object.
[
    async (crawlingContext) => {
        // ...
    },
]""",
        "breakpointLocation": "NONE",
    }

    # Run the Actor and wait for it to finish
    run = client.actor("apify/web-scraper").call(run_input=run_input)

    # Fetch Actor results from the run's dataset (if there are any)
    # print("Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += str(item) + "\n"
    return text_data


# Create the web scraper agent
web_scraper_agent = Agent(
    role='Web Scraper',
    goal='Effectively scrape data from websites for your company',
    backstory='''You are an expert web scraper. Your job is to scrape
    all the data for your company from a given website.''',
    tools=[web_scraper_tool],  # Agent expects a list of tools
    verbose=True
)

# Define the web scraper task
web_scraper_task = Task(
    description='Scrape all the URLs on the site so your company can use it for crawling and scraping.',
    expected_output='All the content of the website listed.',
    agent=web_scraper_agent,
    output_file='data.txt'
)

# Assemble the crew
crew = Crew(
    agents=[web_scraper_agent],
    tasks=[web_scraper_task],
    verbose=True,
)

# Execute tasks
result = crew.kickoff()
print(result)

# Save the result to a file
with open('results.txt', 'w') as f:
    f.write(str(result))
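Hard-coding the Apify token at module level works for a demo, but reading it from the environment keeps the key out of source control. A small sketch (the APIFY_API_TOKEN variable name is a convention assumed here, not mandated by the client):

import os

from apify_client import ApifyClient

# Fails fast with a KeyError if the token was not exported first.
client = ApifyClient(os.environ["APIFY_API_TOKEN"])
-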
Exa Search Tool
from exa_py import Exa


class ExaSearchTool:
    name: str = "ExaSearchTool"
    description: str = "Perform a search using Exa and retrieve contents with specified options"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def _run(self, query: str):
        results = self.exa.search_and_contents(
            query,
            text={"include_html_tags": True, "max_characters": 1000},
        )
        return results


# Example usage
if __name__ == "__main__":
    api_key = "Enter your Exa key here"
    tool = ExaSearchTool(api_key=api_key)
    search_query = "recent midjourney news"
    results = tool._run(search_query)
    print(results)
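search_and_contents returns a structured response rather than plain text. A small sketch of reading fields off it, with attribute names assumed from typical exa_py responses (verify results, title, url, and text against the version you install):

# Each result carries metadata plus the text requested via the `text` option.
for result in results.results:
    print(result.title, result.url)
    print(result.text[:200])
-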
AutoGen Scraping
!pip install -qqq pyautogen apify-client

from apify_client import ApifyClient
from typing_extensions import Annotated

config_list = [
    {"model": "gpt-3.5-turbo", "api_key": "Enter your OpenAI API key"},
]


def scrape_page(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    # Initialize the ApifyClient with your API token
    client = ApifyClient(token="Enter your Apify key")

    # Prepare the Actor input
    run_input = {
        "startUrls": [{"url": url}],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        "excludeUrlGlobs": [],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 0,
        "maxCrawlPages": 4,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": {"useApifyProxy": True},
        "maxSessionRotations": 10,
        "maxRequestRetries": 5,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
[role=\"alert\"], [role=\"banner\"], [role=\"dialog\"], [role=\"alertdialog\"],
[role=\"region\"][aria-label*=\"skip\" i], [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": '[aria-expanded="false"]',
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": True,
        "debugLog": True,
        "saveHtml": True,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    # Run the Actor and wait for it to finish
    run = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=run_input)

    # Fetch Actor results from the run's dataset (if there are any)
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += item.get("text", "") + "\n"

    average_token = 0.75
    max_tokens = 20000  # slightly less than the 32k maximum, to be safe
    text_data = text_data[: int(average_token * max_tokens)]
    return text_data


from autogen import ConversableAgent, register_function

# Create the web scraper agent.
scraper_agent = ConversableAgent(
    "WebScraper",
    llm_config={"config_list": config_list},
    system_message="You are a web scraper and you can scrape any web page using the tools provided. "
    "Return 'TERMINATE' when the scraping is done.",
)

# Create the user proxy agent.
user_proxy_agent = ConversableAgent(
    "UserProxy",
    llm_config=False,  # No LLM for this agent.
    human_input_mode="NEVER",
    code_execution_config=False,  # No code execution for this agent.
    is_termination_msg=lambda x: x.get("content", "") is not None and "terminate" in x["content"].lower(),
    default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
)

# Register the function with the agents.
register_function(
    scrape_page,
    caller=scraper_agent,
    executor=user_proxy_agent,
    name="scrape_page",
    description="Scrape a web page and return the content.",
)

chat_result = user_proxy_agent.initiate_chat(
    scraper_agent,
    message="Can you scrape https://example.com/ for me?",
    summary_method="reflection_with_llm",
    summary_args={
        "summary_prompt": """Summarize the scraped content and format summary EXACTLY as follows:
---
*Website*: `https://example.com/`
---
*content*:
`[CONTENT GOES HERE]`
---
"""
    },
)

print(chat_result)
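One quirk worth noting in the truncation above: text_data[: int(average_token * max_tokens)] keeps int(0.75 * 20000) = 15,000 characters, which at the common rough heuristic of ~4 characters per token is only about 3,750 tokens, far below the 20,000-token budget the comment implies. A sketch that derives the character budget from that chars-per-token heuristic instead (the ratio is a rule of thumb, not a tokenizer count):

# Hypothetical helper: budget characters from an approximate chars-per-token ratio.
CHARS_PER_TOKEN = 4    # rough heuristic for English text
TOKEN_BUDGET = 20_000  # stay under the model's context window

def truncate_to_token_budget(text: str) -> str:
    # 4 chars/token * 20,000 tokens = 80,000 characters kept.
    return text[: CHARS_PER_TOKEN * TOKEN_BUDGET]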