* Add ty type checker to CI and fix type errors - Add ty (Astral's fast Python type checker) to GitHub CI workflow - Fix type annotations across all RAG apps: - Update load_data return types from list[str] to list[dict[str, Any]] - Fix base_rag_example.py to properly handle dict format from create_text_chunks - Fix type errors in leann-core: - chunking_utils.py: Add explicit type annotations - cli.py: Fix return type annotations for PDF extraction functions - interactive_utils.py: Fix readline import type handling - Fix type errors in apps: - wechat_history.py: Fix return type annotations - document_rag.py, code_rag.py: Replace **kwargs with explicit arguments - Add ty configuration to pyproject.toml This resolves the bug introduced in PR #157 where create_text_chunks() changed to return list[dict] but callers were not updated. * Fix remaining ty type errors - Fix slack_mcp_reader.py channel parameter can be None - Fix embedding_compute.py ContextProp type issue - Fix searcher_base.py method override signatures - Fix chunking_utils.py chunk_text assignment - Fix slack_rag.py and twitter_rag.py return types - Fix email.py and image_rag.py method overrides * Fix multimodal benchmark scripts type errors - Fix undefined LeannRetriever -> LeannMultiVector - Add proper type casts for HuggingFace Dataset iteration - Cast task config values to correct types - Add type annotations for dataset row dicts * Enable ty check for multimodal scripts in CI All type errors in multimodal scripts have been fixed, so we can now include them in the CI type checking. 
* Fix all test type errors and enable ty check on tests - Fix test_basic.py: search() takes str not list - Fix test_cli_prompt_template.py: add type: ignore for Mock assignments - Fix test_prompt_template_persistence.py: match BaseSearcher.search signature - Fix test_prompt_template_e2e.py: add type narrowing asserts after skip - Fix test_readme_examples.py: use explicit kwargs instead of **model_args - Fix metadata_filter.py: allow Optional[MetadataFilters] - Update CI to run ty check on tests * Format code with ruff * Format searcher_base.py
230 lines
8.0 KiB
Python
230 lines
8.0 KiB
Python
#!/usr/bin/env python3
|
|
"""
|
|
Slack RAG Application with MCP Support
|
|
|
|
This application enables RAG (Retrieval-Augmented Generation) on Slack messages
|
|
by connecting to Slack MCP servers to fetch live data and index it in LEANN.
|
|
|
|
Usage:
|
|
python -m apps.slack_rag --mcp-server "slack-mcp-server" --query "What did the team discuss about the project?"
|
|
"""
|
|
|
|
import argparse
|
|
import asyncio
|
|
from typing import Any
|
|
|
|
from apps.base_rag_example import BaseRAGExample
|
|
from apps.slack_data.slack_mcp_reader import SlackMCPReader
|
|
|
|
|
|
class SlackMCPRAG(BaseRAGExample):
    """
    RAG application for Slack messages via MCP servers.

    This class provides a complete RAG pipeline for Slack data, including
    MCP server connection, data fetching, indexing, and interactive chat.
    It adds the Slack/MCP-specific command-line options, an optional
    connection self-test, and the ``load_data`` hook; everything else is
    delegated to ``BaseRAGExample``.
    """

    def __init__(self) -> None:
        """Initialize the base example with the Slack-specific name and defaults."""
        super().__init__(
            name="Slack MCP RAG",
            description="RAG application for Slack messages via MCP servers",
            default_index_name="slack_messages",
        )

    def _add_specific_arguments(self, parser: argparse.ArgumentParser) -> None:
        """Add Slack MCP-specific arguments.

        Args:
            parser: The argument parser to which the Slack options are added.
        """
        parser.add_argument(
            "--mcp-server",
            type=str,
            required=True,
            help="Command to start the Slack MCP server (e.g., 'slack-mcp-server' or 'npx slack-mcp-server')",
        )

        parser.add_argument(
            "--workspace-name",
            type=str,
            help="Slack workspace name for better organization and filtering",
        )

        parser.add_argument(
            "--channels",
            nargs="+",
            help="Specific Slack channels to index (e.g., general random). If not specified, fetches from all available channels",
        )

        # NOTE: this flag is `store_true` with `default=True`, so passing it
        # changes nothing, and this class never reads
        # `args.concatenate_conversations`. It is kept so existing command
        # lines keep parsing; `--no-concatenate-conversations` is the switch
        # that actually controls the processing mode.
        parser.add_argument(
            "--concatenate-conversations",
            action="store_true",
            default=True,
            help="Group messages by channel/thread for better context (default: True)",
        )

        parser.add_argument(
            "--no-concatenate-conversations",
            action="store_true",
            help="Process individual messages instead of grouping by channel",
        )

        parser.add_argument(
            "--max-messages-per-channel",
            type=int,
            default=100,
            help="Maximum number of messages to include per channel (default: 100)",
        )

        parser.add_argument(
            "--test-connection",
            action="store_true",
            help="Test MCP server connection and list available tools without indexing",
        )

        parser.add_argument(
            "--max-retries",
            type=int,
            default=5,
            help="Maximum number of retries for failed operations (default: 5)",
        )

        parser.add_argument(
            "--retry-delay",
            type=float,
            default=2.0,
            help="Initial delay between retries in seconds (default: 2.0)",
        )

    def _build_reader(self, args) -> SlackMCPReader:
        """Create a ``SlackMCPReader`` configured from the parsed CLI arguments.

        Shared by ``test_mcp_connection`` and ``load_data`` so both always
        construct the reader with the same settings.
        """
        return SlackMCPReader(
            mcp_server_command=args.mcp_server,
            workspace_name=args.workspace_name,
            concatenate_conversations=not args.no_concatenate_conversations,
            max_messages_per_conversation=args.max_messages_per_channel,
            max_retries=args.max_retries,
            retry_delay=args.retry_delay,
        )

    @staticmethod
    def _print_tool(index: int, tool: dict[str, Any]) -> None:
        """Print a short, numbered summary of one tool entry.

        Missing *or null* ``name``/``description``/``inputSchema`` values fall
        back to placeholders, so an incomplete tool entry cannot raise here
        and be misreported by the caller as a connection failure.
        """
        name = tool.get("name") or "Unknown"
        description = tool.get("description") or "No description available"
        suffix = "..." if len(description) > 100 else ""

        print(f"\n{index}. {name}")
        print(f" Description: {description[:100]}{suffix}")

        # Show input schema if available
        schema = tool.get("inputSchema") or {}
        properties = schema.get("properties")
        if properties:
            shown = list(properties.keys())[:3]  # Show first 3 properties
            more = "..." if len(properties) > 3 else ""
            print(f" Parameters: {', '.join(shown)}{more}")

    async def test_mcp_connection(self, args) -> bool:
        """Test the MCP server connection and display available tools.

        Args:
            args: Parsed command-line arguments (as produced by the parser
                configured in ``_add_specific_arguments``).

        Returns:
            True if the reader could be opened and its tools listed; False if
            any exception was raised along the way (troubleshooting tips are
            printed in that case).
        """
        print(f"Testing connection to MCP server: {args.mcp_server}")

        try:
            reader = self._build_reader(args)

            async with reader:
                tools = await reader.list_available_tools()

                print("Successfully connected to MCP server!")
                print(f"Available tools ({len(tools)}):")

                for i, tool in enumerate(tools, 1):
                    self._print_tool(i, tool)

            return True

        except Exception as e:
            # Deliberately broad: this is a user-facing diagnostic command,
            # so any failure is reported with hints rather than a traceback.
            print(f"Failed to connect to MCP server: {e}")
            print("\nTroubleshooting tips:")
            print("1. Make sure the MCP server is installed and accessible")
            print("2. Check if the server command is correct")
            print("3. Ensure you have proper authentication/credentials configured")
            print("4. Try running the MCP server command directly to test it")
            return False

    async def load_data(self, args) -> list[dict[str, Any]]:
        """Load Slack messages via MCP server.

        Args:
            args: Parsed command-line arguments.

        Returns:
            One ``{"text": ..., "metadata": {"source": "slack"}}`` dict per
            text returned by the reader, or an empty list when the reader
            returned nothing.

        Raises:
            Exception: Any error raised while creating the reader or reading
                Slack data is re-raised after printing likely causes.
        """
        print(f"Connecting to Slack MCP server: {args.mcp_server}")

        if args.workspace_name:
            print(f"Workspace: {args.workspace_name}")

        # Filter out empty strings from channels. If nothing usable remains,
        # fall back to None -- the same value used when --channels is omitted
        # -- so the reader call matches the "all available channels" message.
        channels = [ch for ch in (args.channels or []) if ch.strip()] or None

        if channels:
            print(f"Channels: {', '.join(channels)}")
        else:
            print("Fetching from all available channels")

        concatenate = not args.no_concatenate_conversations
        print(
            f"Processing mode: {'Concatenated conversations' if concatenate else 'Individual messages'}"
        )

        try:
            reader = self._build_reader(args)

            texts = await reader.read_slack_data(channels=channels)

            if not texts:
                print("No messages found! This could mean:")
                print("- The MCP server couldn't fetch messages")
                print("- The specified channels don't exist or are empty")
                print("- Authentication issues with the Slack workspace")
                return []

            print(f"Successfully loaded {len(texts)} text chunks from Slack")

            # Show sample of what was loaded (texts is non-empty here).
            # Assumes each entry is a string -- TODO confirm against
            # SlackMCPReader.read_slack_data.
            first = texts[0]
            sample_text = first[:200] + "..." if len(first) > 200 else first
            print("\nSample content:")
            print("-" * 40)
            print(sample_text)
            print("-" * 40)

            # Convert strings to dict format expected by base class
            return [{"text": text, "metadata": {"source": "slack"}} for text in texts]

        except Exception as e:
            print(f"Error loading Slack data: {e}")
            print("\nThis might be due to:")
            print("- MCP server connection issues")
            print("- Authentication problems")
            print("- Network connectivity issues")
            print("- Incorrect channel names")
            raise

    async def run(self) -> None:
        """Main entry point with MCP connection testing.

        With ``--test-connection`` only the connection test is performed and
        the method returns without indexing; otherwise the base class
        pipeline is run.
        """
        args = self.parser.parse_args()

        # Test connection if requested
        if args.test_connection:
            success = await self.test_mcp_connection(args)
            if not success:
                return
            print(
                "MCP server is working! You can now run without --test-connection to start indexing."
            )
            return

        # Run the standard RAG pipeline
        await super().run()
|
|
|
|
|
|
async def main():
    """Build the Slack MCP RAG application and await its ``run`` coroutine."""
    await SlackMCPRAG().run()
|
|
|
|
|
|
# Script entry point: drive the async main() on a fresh event loop when the
# module is executed directly (not when it is imported).
if __name__ == "__main__":
    asyncio.run(main())
|