Examples
========

This page provides practical examples of using DataVerse ChatBot for common use cases.

Basic Usage
-----------

Setting Up a Simple RAG Chatbot
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

   import asyncio
   from uuid import uuid4

   from chatbot.rag.openai_rag import OpenAIRAG
   from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

   # Path to your content
   content_path = WEB_CONTENT_DIR / "mycontent.txt"

   # Create a unique user ID
   user_id = str(uuid4())

   # Initialize the RAG system
   rag = OpenAIRAG(
       content_path=content_path,
       index_path=INDEXES_DIR,
       model_name="gpt-3.5-turbo-0125",
       chunking_type="recursive",
       rerank=True,
   )

   # Function to chat with the bot
   async def chat():
       while True:
           query = input("You: ")
           if query.lower() in ["exit", "quit"]:
               break
           response = await rag.get_response(query, user_id)
           print(f"Bot: {response}")

   # Run the chat loop
   if __name__ == "__main__":
       asyncio.run(chat())

Web Crawling and Content Extraction
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

   import asyncio

   import tldextract

   from chatbot.crawler import Crawler
   from chatbot.utils.paths import WEB_CONTENT_DIR

   async def crawl_website(url, max_depth=2):
       # Extract the domain name from the URL
       domain_name = tldextract.extract(url).domain

       # Initialize the crawler
       crawler = Crawler(
           base_url=url,
           domain_name=domain_name,
           client="crawl4ai",  # or "scrapegraph"
       )

       # Extract content
       content_path = await crawler.extract_content(
           link=url,
           webpage_only=False,   # Also crawl linked pages
           max_depth=max_depth,  # Crawl depth limit
       )

       print(f"Content extracted and saved to: {content_path}")
       return content_path

   if __name__ == "__main__":
       website_url = "https://example.com"
       asyncio.run(crawl_website(website_url))

File Processing
~~~~~~~~~~~~~~~

.. code-block:: python

   from chatbot.utils.file_loader import FileLoader
   from chatbot.utils.paths import WEB_CONTENT_DIR

   def process_file(file_path, output_name="extracted_content.txt"):
       # Create the output path
       output_path = WEB_CONTENT_DIR / output_name

       # Initialize the file loader
       loader = FileLoader(
           file_path=file_path,
           content_path=output_path,
           client="docling",  # or "langchain"
       )

       # Extract content
       documents = loader.extract_from_file()

       if documents:
           print(f"Successfully extracted {len(documents)} documents")
           print(f"Content saved to: {output_path}")
           return output_path
       else:
           print("Failed to extract content")
           return None

   if __name__ == "__main__":
       # Process a PDF file
       pdf_path = "data/training_files/document.pdf"
       process_file(pdf_path, "pdf_content.txt")

       # Process a DOCX file
       docx_path = "data/training_files/document.docx"
       process_file(docx_path, "docx_content.txt")
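
The extracted text file can then serve as RAG content, exactly as in the first example. A minimal sketch, reusing the ``OpenAIRAG`` constructor shown above; ``pdf_content.txt`` is simply the output name passed to ``process_file`` here, and the question is illustrative:

.. code-block:: python

   import asyncio
   from uuid import uuid4

   from chatbot.rag.openai_rag import OpenAIRAG
   from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

   async def ask(question):
       # Build a RAG on top of the content extracted by process_file()
       rag = OpenAIRAG(
           content_path=WEB_CONTENT_DIR / "pdf_content.txt",
           index_path=INDEXES_DIR,
           model_name="gpt-3.5-turbo-0125",
           chunking_type="recursive",
           rerank=True,
       )
       return await rag.get_response(question, str(uuid4()))

   if __name__ == "__main__":
       print(asyncio.run(ask("What is this document about?")))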

Advanced Usage
--------------

Using Voice Mode
~~~~~~~~~~~~~~~~

.. code-block:: python

   import asyncio
   from uuid import uuid4

   from chatbot.voice_mode import VoiceMode
   from chatbot.rag.claude_rag import ClaudeRAG
   from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

   # Initialize voice mode
   voice = VoiceMode()

   # Initialize the RAG
   rag = ClaudeRAG(
       content_path=WEB_CONTENT_DIR / "mycontent.txt",
       index_path=INDEXES_DIR,
   )

   # User ID for tracking chat history
   user_id = str(uuid4())

   async def voice_chat():
       print("Press Enter to start recording (5-second limit)...")
       input()

       # Record and transcribe
       wav_path = voice.start_recording()
       transcription = voice.transcribe(wav_path)
       print(f"You said: {transcription}")

       # Get a response
       response = await rag.get_response(transcription, user_id)
       print(f"Bot: {response}")

       # Convert the response to speech
       voice.text_to_speech(response)

   if __name__ == "__main__":
       asyncio.run(voice_chat())

Custom Dataset Creation and Classifier Training
~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

   from chatbot.utils.make_dataset import create_dataset
   from chatbot.utils.train_clf import train_classifier
   from chatbot.utils.paths import DATASETS_DIR, MODELS_DIR

   # Step 1: Create a dataset from RAG responses
   def prepare_dataset():
       # Create a dataset with labels (1 for uncertain, 0 for certain)
       dataset = create_dataset(
           input_file=DATASETS_DIR / "raw_responses.csv",
           output_file=DATASETS_DIR / "labeled_responses.csv",
           model_name="all-MiniLM-L6-v2",
       )

       print(f"Dataset created with {len(dataset)} samples")
       return dataset

   # Step 2: Train a classifier on the dataset
   def train_uncertainty_classifier(dataset_path):
       # Train the classifier
       metrics = train_classifier(
           dataset_path=dataset_path,
           model_type="xgboost",  # or "random_forest"
           output_path=MODELS_DIR / "clf.pkl",
           test_size=0.2,
           random_state=42,
       )

       print("Classifier trained successfully")
       print(f"Accuracy: {metrics['accuracy']:.4f}")
       print(f"Precision: {metrics['precision']:.4f}")
       print(f"Recall: {metrics['recall']:.4f}")

   if __name__ == "__main__":
       # Prepare the dataset
       dataset = prepare_dataset()

       # Train the classifier
       train_uncertainty_classifier(DATASETS_DIR / "labeled_responses.csv")
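
Once trained, the saved classifier can be used to flag uncertain answers at runtime. A rough sketch, assuming ``clf.pkl`` is a pickled scikit-learn-compatible model trained on ``all-MiniLM-L6-v2`` sentence embeddings of the responses; both are assumptions about the project internals, not a documented API:

.. code-block:: python

   import pickle

   from sentence_transformers import SentenceTransformer

   from chatbot.utils.paths import MODELS_DIR

   def is_uncertain(answer):
       # Embed the answer with the same model used to build the dataset
       encoder = SentenceTransformer("all-MiniLM-L6-v2")
       features = encoder.encode([answer])

       # Load the trained classifier (assumed to be a pickle file)
       with open(MODELS_DIR / "clf.pkl", "rb") as f:
           clf = pickle.load(f)

       # 1 = uncertain, 0 = certain (the labels used in Step 1 above)
       return bool(clf.predict(features)[0])

   if __name__ == "__main__":
       print(is_uncertain("I'm not sure, but the answer might be 42."))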

Implementing a Custom RAG
~~~~~~~~~~~~~~~~~~~~~~~~~

.. code-block:: python

   from chatbot.rag.base_rag import BaseRAG
   from chatbot.embeddings.base_embedding import HuggingFaceEmbedding
   from chatbot.config import Config

   class CustomRAG(BaseRAG):
       """Custom RAG implementation with a local HuggingFace model."""

       def _initialize_models(self):
           """Initialize models for the RAG system."""
           # Use a local embedding model
           self.embedding_provider = HuggingFaceEmbedding(
               embedding_model="sentence-transformers/all-mpnet-base-v2",
               device="cpu",
           )

           # Custom model configuration
           self.model_name = "custom-model"
           self.in_price = 0.0   # Free local model
           self.out_price = 0.0  # Free local model

       async def get_response(self, query, user_id):
           """Generate a response using a custom approach."""
           # Find relevant context
           context = self._find_relevant_context(query, top_k=5)

           # Create a prompt with the context
           prompt = f"Context:\n{context}\n\nQuestion: {query}"

           # ... your custom logic to generate a response ...
           # This could use a local model, a rule-based system, or an external API.
           # For this example, just return a placeholder.
           response = f"This is a custom RAG response for: {query}"

           # Add the exchange to the chat history
           self.db.append_chat_history(
               user_id=user_id,
               question=query,
               answer=response,
               model_used=self.model_name,
               embedding_model_used=self.embedding_provider.embedding_model,
           )

           return response

       @classmethod
       def get_config_class(cls):
           """Return the configuration class for this RAG."""
           return Config
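
A short usage sketch for the class above. It assumes ``CustomRAG`` inherits ``BaseRAG``'s constructor and therefore accepts the same ``content_path`` and ``index_path`` arguments as the built-in RAGs in the earlier examples:

.. code-block:: python

   import asyncio
   from uuid import uuid4

   from chatbot.utils.paths import WEB_CONTENT_DIR, INDEXES_DIR

   async def main():
       # CustomRAG is the class defined in the previous example
       rag = CustomRAG(
           content_path=WEB_CONTENT_DIR / "mycontent.txt",
           index_path=INDEXES_DIR,
       )
       answer = await rag.get_response("What does this site offer?", str(uuid4()))
       print(answer)

   if __name__ == "__main__":
       asyncio.run(main())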