phi3_local_rag

Install Ollama

ollama run phi3

ollama pull nomic-embed-text

pip install langchain_experimental
pip install langchain langchain-community langchain-text-splitters chromadb unstructured
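
Optional sanity check (a minimal sketch, assuming Ollama is running locally and nomic-embed-text has been pulled): embed a test string to confirm the embedding model is reachable before indexing.

from langchain_community.embeddings import OllamaEmbeddings

# Embed a short test string; a successful call prints the embedding dimension.
embeddings = OllamaEmbeddings(model="nomic-embed-text")
vector = embeddings.embed_query("hello world")
print(len(vector))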

indexer.py

from langchain_experimental.text_splitter import SemanticChunker
from langchain_text_splitters import RecursiveCharacterTextSplitter


from langchain_community.document_loaders import DirectoryLoader
from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma

# Load documents from a directory
loader = DirectoryLoader("./places_transcripts", glob="**/*.txt")

print("dir loaded loader")

documents = loader.load()

print(len(documents))

# Create embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=True)

# Alternative: create a semantic text splitter instead of the character splitter below
# text_splitter = SemanticChunker(embeddings, breakpoint_threshold_type="interquartile")

text_splitter = RecursiveCharacterTextSplitter(
    chunk_size=1500,
    chunk_overlap=300,
    add_start_index=True,
)

# Split documents into chunks
texts = text_splitter.split_documents(documents)

# Create vector store
vectorstore = Chroma.from_documents(
    documents=texts,
    embedding=embeddings,
    persist_directory="./db-place")

print("vectorstore created")
ollama_phi3_rag.py


from langchain_community.embeddings import OllamaEmbeddings
from langchain_community.vectorstores import Chroma
from langchain_community.chat_models import ChatOllama

from langchain.prompts import ChatPromptTemplate
from langchain.schema.runnable import RunnablePassthrough
from langchain.schema.output_parser import StrOutputParser


# Create embeddings
embeddings = OllamaEmbeddings(model="nomic-embed-text", show_progress=False)

db = Chroma(persist_directory="./db-place",
            embedding_function=embeddings)

# Create retriever
retriever = db.as_retriever(
    search_type="similarity",
    search_kwargs={"k": 5}
)

# Create Ollama language model - Phi-3
local_llm = 'phi3'

llm = ChatOllama(model=local_llm,
                 keep_alive="3h",
                 num_predict=512,  # maximum number of tokens to generate
                 temperature=0)

# Create prompt template
template = """<bos><start_of_turn>user\nAnswer the question based only on the following context and extract out a meaningful answer. \
Please write in full sentences with correct spelling and punctuation. if it makes sense use lists. \
If the context doen't contain the answer, just respond that you are unable to find an answer. \

CONTEXT: {context}

QUESTION: {question}

<end_of_turn>
<start_of_turn>model\n
ANSWER:"""
prompt = ChatPromptTemplate.from_template(template)

# Create the RAG chain using LCEL (streaming output is used below)
rag_chain = (
    {"context": retriever, "question": RunnablePassthrough()}
    | prompt
    | llm
)

# Function to ask questions
def ask_question(question):
    print("Answer:\n\n", end=" ", flush=True)
    for chunk in rag_chain.stream(question):
        print(chunk.content, end="", flush=True)
    print("\n")

# Example usage
if __name__ == "__main__":
    while True:
        user_question = input("Ask a question (or type 'quit' to exit): ")
        if user_question.lower() == 'quit':
            break
        ask_question(user_question)
        # print("\nFull answer received.\n")
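
An optional refinement (a sketch, not part of the script above): the retriever returns a list of Document objects, so the raw list is stringified into {context}. Joining the page contents first gives the model a cleaner context.

# Join retrieved documents into a single context string before the prompt.
def format_docs(docs):
    return "\n\n".join(doc.page_content for doc in docs)

rag_chain = (
    {"context": retriever | format_docs, "question": RunnablePassthrough()}
    | prompt
    | llm
)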

Open Weather Tool

pip install llama-index-tools-weather
pip install pyowm
from llama_index.tools.weather import OpenWeatherMapToolSpec
from llama_index.agent.openai import OpenAIAgent

# Create the tool specification with your API key
tool_spec = OpenWeatherMapToolSpec(key="Enter your key here")

# Initialize the OpenAIAgent with the tool specification
agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())

# Query the agent about the weather
response = agent.chat("What is the temperature like in London?")
print(response)

Adding the weather tool to PraisonAI using LlamaIndex

pip install praisonai

tools.py

from llama_index.tools.weather import OpenWeatherMapToolSpec
from praisonai_tools import BaseTool
from llama_index.agent.openai import OpenAIAgent
import os

class WeatherTool(BaseTool):
    name: str = "Weather Tool"
    description: str = "Get the current weather information for a specified location"

    def _run(self, location: str):
        # Use your API key from the environment variable

        api_key = os.getenv("OPENWEATHERMAP_API_KEY")
        tool_spec = OpenWeatherMapToolSpec(key=api_key)
        agent = OpenAIAgent.from_tools(tool_spec.to_tool_list())
        response = agent.chat(f"What is the temperature like in {location}?")
        return response
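
To sanity-check the tool outside PraisonAI (a minimal sketch; it assumes OPENWEATHERMAP_API_KEY and OPENAI_API_KEY are set in the environment):

# Optional quick local test of the WeatherTool class defined above.
if __name__ == "__main__":
    print(WeatherTool()._run("Paris"))
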
praisonai --init What is the weather in Paris tomorrow

The above command creates the agents.yaml file.
Open agents.yaml and add the tool to the relevant agent's tools list, e.g. tools: - WeatherTool

Wikipedia Tool
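
The LangChain Wikipedia wrapper relies on the wikipedia package:

pip install wikipedia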

from langchain_community.tools import WikipediaQueryRun
from langchain_community.utilities import WikipediaAPIWrapper
from langchain_core.pydantic_v1 import BaseModel, Field


class WikiInputs(BaseModel):
    """Inputs to the wikipedia tool."""

    query: str = Field(
        description="query to look up in Wikipedia, should be 3 or less words"
    )


# Wikipedia does not require an API key
api_wrapper = WikipediaAPIWrapper(top_k_results=1, doc_content_chars_max=1000)

tool = WikipediaQueryRun(
    name="wiki-tool",
    description="look up things in wikipedia",
    args_schema=WikiInputs,
    api_wrapper=api_wrapper,
    return_direct=True,
)

print(tool.run("Who is Dwayne Johnson?"))

Exa Tools

from exa_py import Exa
from praisonai_tools import BaseTool

class ExaSearch:
    name: str = "ExaSearch"
    description: str = "Perform a search using this ExaSearch tool and returns search results with url"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, query: str):
        results = self.exa.search_and_contents(
            query,
            text={"include_html_tags": True, "max_characters": 1000},
        )
        return results


class ExaSimilar:
    name: str = "ExaSimilar"
    description: str = "Search for webpages similar to a given URL using ExaSimilar tool"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, url: str):
        """Search for webpages similar to a given URL.
        The url passed in should be a URL returned from `search`.
        """
        results = self.exa.find_similar(url, num_results=3)
        return results


class ExaContents:
    name: str = "ExaContents"
    description: str = "Get the contents of a webpage using a list of urls using ExaContents tool"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def run(self, ids: list):
        """Get the contents of a webpage.
        The ids must be passed in as a list, a list of ids returned from `search`.
        """
        contents = self.exa.get_contents(ids)
        contents_str = str(contents)
        split_contents = contents_str.split("URL:")
        trimmed_contents = [content[:1000] for content in split_contents]
        return "\n\n".join(trimmed_contents)

# Example usage
if __name__ == "__main__":
    api_key = "Enter your exa key"

    # Search
    search_tool = ExaSearch(api_key=api_key)
    search_query = "latest AI News"
    search_results = search_tool.run(search_query)
    print("Search Results:", search_results)

    # Find similar webpages
    similar_tool = ExaSimilar(api_key=api_key)
    find_similar_url = "https://boomi.com/"  # Valid URL
    similar_results = similar_tool.run(find_similar_url)
    print("Similar Results:", similar_results)

    # Get contents using ids returned from `search`
    contents_tool = ExaContents(api_key=api_key)
    content_ids = ["tesla.com"]  # Replace with actual IDs
    contents = contents_tool.run(content_ids)
    print("Contents:", contents)

CrewAI Apify Tool

from crewai import Agent, Task, Crew
import os
from apify_client import ApifyClient
from langchain.tools import tool
from typing_extensions import Annotated

client = ApifyClient("Enter your Apify key here")

@tool("Web Scraper Tool")
def web_scraper_tool(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    """Web Scraper loads Start URLs in the browser and executes Page function on each page to extract data from it."""

    run_input = {
        "runMode": "DEVELOPMENT",
        "startUrls": [{ "url": url }],
        "linkSelector": "a[href]",
        "globs": [{ "glob": "https://example.com/*" }],
        "pseudoUrls": [],
        "excludes": [{ "glob": "/**/*.{png,jpg,jpeg,pdf}" }],
        "pageFunction": """// The function accepts a single argument: the "context" object.
        // see https://apify.com/apify/web-scraper#page-function
        async function pageFunction(context) {
            // This statement works as a breakpoint when you're trying to debug your code. Works only with Run mode: DEVELOPMENT!
            // debugger;
            const $ = context.jQuery;
            const pageTitle = $('title').first().text();
            const h1 = $('h1').first().text();
            const first_h2 = $('h2').first().text();
            const random_text_from_the_page = $('p').first().text();
            // Print some information to actor log
            context.log.info(`URL: ${context.request.url}, TITLE: ${pageTitle}`);

            // Manually add a new page to the queue for scraping.
            await context.enqueueRequest({ url: context.request.url });

            return {
                url: context.request.url,
                pageTitle,
                h1,
                first_h2,
                random_text_from_the_page
            };
        }""",
        "proxyConfiguration": { "useApifyProxy": True },
        "initialCookies": [],
        "waitUntil": ["networkidle2"],
        "preNavigationHooks": """// We need to return array of (possibly async) functions here.
        // and "gotoOptions".
        [
            async (crawlingContext, gotoOptions) => {
                // ...
            },
        ]
        """,
        "postNavigationHooks": """// We need to return array of (possibly async) functions here.
        // The functions accept a single argument: the "crawlingContext" object.
        [
            async (crawlingContext) => {
                // ...
            },
        ]""",
        "breakpointLocation": "NONE",
    }
    # Run the Actor and wait for it to finish
    run = client.actor("apify/web-scraper").call(run_input=run_input)

    # Fetch and print Actor results from the run's dataset (if there are any)
    #print("Check your data here: https://console.apify.com/storage/datasets/" + run["defaultDatasetId"])
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += str(item) + "\n"
    return text_data
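
# Optional quick test (a sketch, not part of the original script): you could call the
# tool directly to confirm the Apify run returns data before handing it to an agent, e.g.
# print(web_scraper_tool.run({"url": "https://example.com/"}))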

# Create the web scraper agent
web_scraper_agent = Agent(
    role='Web Scraper',
    goal='Effectively scrape data from websites for your company',
    backstory='''You are an expert web scraper. Your job is to scrape all the data for
                your company from a given website.
                ''',
    tools=[web_scraper_tool],
    verbose=True
)

# Define the web scraper task
web_scraper_task = Task(
    description='Scrape all the URLs on the site so your company can use it for crawling and scraping.',
    expected_output='All the content of the website listed.',
    agent=web_scraper_agent,
    output_file='data.txt'
)

# Assemble the crew
crew = Crew(
    agents=[web_scraper_agent],
    tasks=[web_scraper_task],
    verbose=2,
)

# Execute tasks
result = crew.kickoff()
print(result)

# Save the result to a file
with open('results.txt', 'w') as f:
    f.write(str(result))

Exa Search Tool

from exa_py import Exa
from praisonai_tools import BaseTool

class ExaSearchTool:
    name: str = "ExaSearchTool"
    description: str = "Perform a search using Exa and retrieve contents with specified options"

    def __init__(self, api_key: str):
        self.exa = Exa(api_key=api_key)

    def _run(self, query: str):
        results = self.exa.search_and_contents(
            query,
            text={"include_html_tags": True, "max_characters": 1000},
        )
        return results

# Example usage
if __name__ == "__main__":
    api_key = "Enter your Exa key here"
    tool = ExaSearchTool(api_key=api_key)
    search_query = "recent midjourney news"
    results = tool._run(search_query)
    print(results)

AutoGen Scraping

! pip install -qqq pyautogen apify-client

import os
import openai

config_list = [
    {"model": "gpt-3.5-turbo", "api_key": "Enter your api key"},
]

from apify_client import ApifyClient
from typing_extensions import Annotated


def scrape_page(url: Annotated[str, "https://example.com/"]) -> Annotated[str, "Scraped content"]:
    # Initialize the ApifyClient with your API token
    client = ApifyClient(token="Enter your apify key")

    # Prepare the Actor input
    run_input = {
        "startUrls": [{"url": url}],
        "useSitemaps": False,
        "crawlerType": "playwright:firefox",
        "includeUrlGlobs": [],
        "excludeUrlGlobs": [],
        "ignoreCanonicalUrl": False,
        "maxCrawlDepth": 0,
        "maxCrawlPages": 4,
        "initialConcurrency": 0,
        "maxConcurrency": 200,
        "initialCookies": [],
        "proxyConfiguration": {"useApifyProxy": True},
        "maxSessionRotations": 10,
        "maxRequestRetries": 5,
        "requestTimeoutSecs": 60,
        "dynamicContentWaitSecs": 10,
        "maxScrollHeightPixels": 5000,
        "removeElementsCssSelector": """nav, footer, script, style, noscript, svg,
    [role=\"alert\"],
    [role=\"banner\"],
    [role=\"dialog\"],
    [role=\"alertdialog\"],
    [role=\"region\"][aria-label*=\"skip\" i],
    [aria-modal=\"true\"]""",
        "removeCookieWarnings": True,
        "clickElementsCssSelector": '[aria-expanded="false"]',
        "htmlTransformer": "readableText",
        "readableTextCharThreshold": 100,
        "aggressivePrune": False,
        "debugMode": True,
        "debugLog": True,
        "saveHtml": True,
        "saveMarkdown": True,
        "saveFiles": False,
        "saveScreenshots": False,
        "maxResults": 9999999,
        "clientSideMinChangePercentage": 15,
        "renderingTypeDetectionPercentage": 10,
    }

    # Run the Actor (ID "aYG0l9s7dbB7j3gbS" is apify/website-content-crawler) and wait for it to finish
    run = client.actor("aYG0l9s7dbB7j3gbS").call(run_input=run_input)

    # Fetch and print Actor results from the run's dataset (if there are any)
    text_data = ""
    for item in client.dataset(run["defaultDatasetId"]).iterate_items():
        text_data += item.get("text", "") + "\n"

    average_token = 0.75
    max_tokens = 20000  # keep well below the model's 32k context window to be safe
    text_data = text_data[: int(average_token * max_tokens)]  # rough character budget
    return text_data

from autogen import ConversableAgent, register_function

# Create web scraper agent.
scraper_agent = ConversableAgent(
    "WebScraper",
    llm_config={"config_list": config_list},
    system_message="You are a web scraper and you can scrape any web page using the tools provided. "
    "Return 'TERMINATE' when the scraping is done.",
)

# Create user proxy agent.
user_proxy_agent = ConversableAgent(
    "UserProxy",
    llm_config=False,  # No LLM for this agent.
    human_input_mode="NEVER",
    code_execution_config=False,  # No code execution for this agent.
    is_termination_msg=lambda x: x.get("content", "") is not None and "terminate" in x["content"].lower(),
    default_auto_reply="Please continue if not finished, otherwise return 'TERMINATE'.",
)

# Register the function with the agents.
register_function(
    scrape_page,
    caller=scraper_agent,
    executor=user_proxy_agent,
    name="scrape_page",
    description="Scrape a web page and return the content.",
)

chat_result = user_proxy_agent.initiate_chat(
    scraper_agent,
    message="Can you scrape https://example.com/ for me?",
    summary_method="reflection_with_llm",
    summary_args={
        "summary_prompt": """Summarize the scraped content and format summary EXACTLY as follows:
---
*Website*:
`https://example.com/`
---
*content*:
`[CONTENT GOES HERE]`
---
"""
    },
)

print(chat_result)