Examples
This page provides practical examples of how to use DataVerse ChatBot for various use cases.
Basic Usage
Setting Up a Simple RAG Chatbot
import asyncio
from uuid import uuid4

from chatbot.rag.openai_rag import OpenAIRAG
from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

# Path to your content
content_path = WEB_CONTENT_DIR / "mycontent.txt"

# Create a unique user ID so chat history is tracked per session
user_id = str(uuid4())

# Initialize the RAG system
rag = OpenAIRAG(
    content_path=content_path,
    index_path=INDEXES_DIR,
    model_name="gpt-3.5-turbo-0125",
    chunking_type="recursive",
    rerank=True,
)


async def chat():
    """Run an interactive chat loop until the user types 'exit' or 'quit'."""
    while True:
        query = input("You: ")
        if query.lower() in ["exit", "quit"]:
            break
        # Await the RAG response; user_id ties the turn to this session's history
        response = await rag.get_response(query, user_id)
        print(f"Bot: {response}")


# Run the chat loop
if __name__ == "__main__":
    asyncio.run(chat())
Web Crawling and Content Extraction
import asyncio

import tldextract

from chatbot.crawler import Crawler
from chatbot.utils.paths import WEB_CONTENT_DIR


async def crawl_website(url, max_depth=2):
    """Crawl *url* (and linked pages up to *max_depth*) and save its content.

    Returns the path of the file the extracted content was saved to.
    """
    # Extract the registered domain name from the URL (e.g. "example")
    domain_name = tldextract.extract(url).domain

    # Initialize crawler
    crawler = Crawler(
        base_url=url,
        domain_name=domain_name,
        client="crawl4ai",  # or "scrapegraph"
    )

    # Extract content
    content_path = await crawler.extract_content(
        link=url,
        webpage_only=False,  # Crawl linked pages
        max_depth=max_depth,  # Crawl depth limit
    )

    print(f"Content extracted and saved to: {content_path}")
    return content_path


if __name__ == "__main__":
    website_url = "https://example.com"
    asyncio.run(crawl_website(website_url))
File Processing
from chatbot.utils.file_loader import FileLoader
from chatbot.utils.paths import WEB_CONTENT_DIR


def process_file(file_path, output_name="extracted_content.txt"):
    """Extract text from *file_path* and save it under WEB_CONTENT_DIR.

    Returns the output path on success, or None if extraction failed.
    """
    # Create output path
    output_path = WEB_CONTENT_DIR / output_name

    # Initialize file loader
    loader = FileLoader(
        file_path=file_path,
        content_path=output_path,
        client="docling",  # or "langchain"
    )

    # Extract content
    documents = loader.extract_from_file()

    if documents:
        print(f"Successfully extracted {len(documents)} documents")
        print(f"Content saved to: {output_path}")
        return output_path
    else:
        print("Failed to extract content")
        return None


if __name__ == "__main__":
    # Process a PDF file
    pdf_path = "data/training_files/document.pdf"
    process_file(pdf_path, "pdf_content.txt")

    # Process a DOCX file
    docx_path = "data/training_files/document.docx"
    process_file(docx_path, "docx_content.txt")
Advanced Usage
Using Voice Mode
import asyncio
from uuid import uuid4

from chatbot.voice_mode import VoiceMode
from chatbot.rag.claude_rag import ClaudeRAG
from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

# Initialize voice mode
voice = VoiceMode()

# Initialize RAG
rag = ClaudeRAG(
    content_path=WEB_CONTENT_DIR / "mycontent.txt",
    index_path=INDEXES_DIR,
)

# User ID for tracking chat history
user_id = str(uuid4())


async def voice_chat():
    """Record one spoken question, answer it, and speak the answer back."""
    print("Press Enter to start recording (5-second limit)...")
    input()

    # Record and transcribe
    wav_path = voice.start_recording()
    transcription = voice.transcribe(wav_path)
    print(f"You said: {transcription}")

    # Get response
    response = await rag.get_response(transcription, user_id)
    print(f"Bot: {response}")

    # Convert response to speech
    voice.text_to_speech(response)


if __name__ == "__main__":
    asyncio.run(voice_chat())
Custom Dataset Creation and Classifier Training
import pandas as pd

from chatbot.utils.make_dataset import create_dataset
from chatbot.utils.train_clf import train_classifier
from chatbot.utils.paths import DATASETS_DIR, MODELS_DIR


# Step 1: Create a dataset from RAG responses
def prepare_dataset():
    """Build a labeled dataset (1 = uncertain, 0 = certain) from raw responses."""
    dataset = create_dataset(
        input_file=DATASETS_DIR / "raw_responses.csv",
        output_file=DATASETS_DIR / "labeled_responses.csv",
        model_name="all-MiniLM-L6-v2",
    )

    print(f"Dataset created with {len(dataset)} samples")
    return dataset


# Step 2: Train a classifier on the dataset
def train_uncertainty_classifier(dataset_path):
    """Train an uncertainty classifier on *dataset_path* and report metrics."""
    metrics = train_classifier(
        dataset_path=dataset_path,
        model_type="xgboost",  # or "random_forest"
        output_path=MODELS_DIR / "clf.pkl",
        test_size=0.2,
        random_state=42,  # fixed seed for reproducible train/test split
    )

    print("Classifier trained successfully")
    print(f"Accuracy: {metrics['accuracy']:.4f}")
    print(f"Precision: {metrics['precision']:.4f}")
    print(f"Recall: {metrics['recall']:.4f}")


if __name__ == "__main__":
    # Prepare dataset
    dataset = prepare_dataset()

    # Train classifier
    train_uncertainty_classifier(DATASETS_DIR / "labeled_responses.csv")
Implementing a Custom RAG
from chatbot.rag.base_rag import BaseRAG
from chatbot.embeddings.base_embedding import HuggingFaceEmbedding
from chatbot.config import Config


class CustomRAG(BaseRAG):
    """Custom RAG implementation with a local HuggingFace model."""

    def _initialize_models(self):
        """Initialize embedding and generation models for the RAG system."""
        # Use a local embedding model
        self.embedding_provider = HuggingFaceEmbedding(
            embedding_model="sentence-transformers/all-mpnet-base-v2",
            device="cpu",
        )

        # Custom model configuration
        self.model_name = "custom-model"
        self.in_price = 0.0  # Free local model
        self.out_price = 0.0  # Free local model

    async def get_response(self, query, user_id):
        """Generate a response for *query* and record it in chat history."""
        # Find relevant context
        context = self._find_relevant_context(query, top_k=5)

        # Create a prompt with the context
        prompt = f"Context:\n{context}\n\nQuestion: {query}"

        # ... your custom logic to generate a response ...
        # This could use a local model, rule-based system, or external API

        # For this example, just return a placeholder
        response = f"This is a custom RAG response for: {query}"

        # Add to chat history
        self.db.append_chat_history(
            user_id=user_id,
            question=query,
            answer=response,
            model_used=self.model_name,
            embedding_model_used=self.embedding_provider.embedding_model,
        )

        return response

    @classmethod
    def get_config_class(cls):
        """Return the configuration class for this RAG."""
        return Config