Update Slack RAG integration with improved CSV parsing and new screenshots

- Fixed CSV message parsing in slack_mcp_reader.py to properly handle individual messages
- Updated slack_rag.py to filter empty channel strings
- Enhanced slack-setup-guide.md with two new query examples:
  - Advisor Models query: 'train black-box models to adopt to your personal data'
  - Barbarians at the Gate query: 'AI-driven research systems ADRS'
- Replaced old screenshots with four new ones showing both query examples
- Updated documentation to use User OAuth Token (xoxp-) instead of Bot Token (xoxb-)
- Added proper command examples with --no-concatenate-conversations and --force-rebuild flags
This commit is contained in:
aakash
2025-10-18 22:25:16 -07:00
parent 73ffc3cc37
commit fb9405e99a
9 changed files with 162 additions and 61 deletions

View File

@@ -203,6 +203,7 @@ class SlackMCPReader:
# Common tool names might be: 'get_messages', 'list_messages', 'fetch_channel_history'
tools = await self.list_available_tools()
logger.info(f"Available tools: {[tool.get('name') for tool in tools]}")
message_tool = None
# Look for a tool that can fetch messages - prioritize conversations_history
@@ -213,6 +214,7 @@ class SlackMCPReader:
tool_name = tool.get("name", "").lower()
if "conversations_history" in tool_name:
message_tool = tool
logger.info(f"Found conversations_history tool: {tool}")
break
# If not found, look for other message-fetching tools
@@ -230,7 +232,7 @@ class SlackMCPReader:
raise RuntimeError("No message fetching tool found in MCP server")
# Prepare tool call parameters
tool_params = {"limit": limit}
tool_params = {"limit": "180d"} # Use 180 days to get older messages
if channel:
# For conversations_history, use channel_id parameter
if message_tool["name"] == "conversations_history":
@@ -241,6 +243,8 @@ class SlackMCPReader:
tool_params[param_name] = channel
break
logger.info(f"Tool parameters: {tool_params}")
fetch_request = {
"jsonrpc": "2.0",
"id": 3,
@@ -261,8 +265,8 @@ class SlackMCPReader:
try:
messages = json.loads(content["text"])
except json.JSONDecodeError:
# If not JSON, treat as plain text
messages = [{"text": content["text"], "channel": channel or "unknown"}]
# If not JSON, try to parse as CSV format (Slack MCP server format)
messages = self._parse_csv_messages(content["text"], channel)
else:
messages = result["content"]
else:
@@ -271,6 +275,56 @@ class SlackMCPReader:
return messages if isinstance(messages, list) else [messages]
def _parse_csv_messages(self, csv_text: str, channel: str) -> list[dict[str, Any]]:
"""Parse CSV format messages from Slack MCP server."""
import csv
import io
messages = []
try:
# Split by lines and process each line as a CSV row
lines = csv_text.strip().split("\n")
if not lines:
return messages
# Skip header line if it exists
start_idx = 0
if lines[0].startswith("MsgID,UserID,UserName"):
start_idx = 1
for line in lines[start_idx:]:
if not line.strip():
continue
# Parse CSV line
reader = csv.reader(io.StringIO(line))
try:
row = next(reader)
if len(row) >= 7: # Ensure we have enough columns
message = {
"ts": row[0],
"user": row[1],
"username": row[2],
"real_name": row[3],
"channel": row[4],
"thread_ts": row[5],
"text": row[6],
"time": row[7] if len(row) > 7 else "",
"reactions": row[8] if len(row) > 8 else "",
"cursor": row[9] if len(row) > 9 else "",
}
messages.append(message)
except Exception as e:
logger.warning(f"Failed to parse CSV line: {line[:100]}... Error: {e}")
continue
except Exception as e:
logger.warning(f"Failed to parse CSV messages: {e}")
# Fallback: treat entire text as one message
messages = [{"text": csv_text, "channel": channel or "unknown"}]
return messages
def _format_message(self, message: dict[str, Any]) -> str:
"""Format a single message for indexing."""
text = message.get("text", "")
@@ -342,6 +396,40 @@ class SlackMCPReader:
return "\n".join(content_parts)
async def get_all_channels(self) -> list[str]:
    """Get list of all available channels."""
    request = {
        "jsonrpc": "2.0",
        "id": 4,
        "method": "tools/call",
        "params": {"name": "channels_list", "arguments": {}},
    }
    try:
        response = await self.send_mcp_request(request)
        # Guard clauses replace the original nested conditionals.
        if "result" not in response:
            return []
        result = response["result"]
        if "content" not in result or not isinstance(result["content"], list):
            return []
        first = result["content"][0] if result["content"] else {}
        if "text" not in first:
            return []
        # Pull channel IDs / names out of the free-form text listing.
        channels: list[str] = []
        for raw_line in first["text"].split("\n"):
            if not raw_line.strip():
                continue
            if "#" not in raw_line and "C" not in raw_line[:10]:
                continue
            for token in raw_line.split():
                if token.startswith("C") and len(token) > 5:
                    channels.append(token)  # looks like a channel ID
                elif token.startswith("#"):
                    channels.append(token[1:])  # strip the leading '#'
        logger.info(f"Found {len(channels)} channels: {channels}")
        return channels
    except Exception as e:
        logger.warning(f"Failed to get channels list: {e}")
        return []
async def read_slack_data(self, channels: Optional[list[str]] = None) -> list[str]:
"""
Read Slack data and return formatted text chunks.
@@ -378,36 +466,33 @@ class SlackMCPReader:
logger.warning(f"Failed to fetch messages from channel {channel}: {e}")
continue
else:
# Fetch from all available channels/conversations
# This is a simplified approach - real implementation would need to
# discover available channels first
try:
messages = await self.fetch_slack_messages(limit=1000)
if messages:
# Group messages by channel if concatenating
if self.concatenate_conversations:
channel_messages = {}
for message in messages:
channel = message.get(
"channel", message.get("channel_name", "general")
)
if channel not in channel_messages:
channel_messages[channel] = []
channel_messages[channel].append(message)
# Fetch from all available channels
logger.info("Fetching from all available channels...")
all_channels = await self.get_all_channels()
# Create concatenated content for each channel
for channel, msgs in channel_messages.items():
text_content = self._create_concatenated_content(msgs, channel)
if not all_channels:
# Fallback to common channel names if we can't get the list
all_channels = ["general", "random", "announcements", "C0GN5BX0F"]
logger.info(f"Using fallback channels: {all_channels}")
for channel in all_channels:
try:
logger.info(f"Searching channel: {channel}")
messages = await self.fetch_slack_messages(channel=channel, limit=1000)
if messages:
if self.concatenate_conversations:
text_content = self._create_concatenated_content(messages, channel)
if text_content.strip():
all_texts.append(text_content)
else:
# Process individual messages
for message in messages:
formatted_msg = self._format_message(message)
if formatted_msg.strip():
all_texts.append(formatted_msg)
except Exception as e:
logger.error(f"Failed to fetch messages: {e}")
else:
# Process individual messages
for message in messages:
formatted_msg = self._format_message(message)
if formatted_msg.strip():
all_texts.append(formatted_msg)
except Exception as e:
logger.warning(f"Failed to fetch messages from channel {channel}: {e}")
continue
return all_texts

View File

@@ -146,8 +146,11 @@ class SlackMCPRAG(BaseRAGExample):
if args.workspace_name:
print(f"Workspace: {args.workspace_name}")
if args.channels:
print(f"Channels: {', '.join(args.channels)}")
# Filter out empty strings from channels
channels = [ch for ch in args.channels if ch.strip()] if args.channels else None
if channels:
print(f"Channels: {', '.join(channels)}")
else:
print("Fetching from all available channels")
@@ -166,7 +169,7 @@ class SlackMCPRAG(BaseRAGExample):
retry_delay=args.retry_delay,
)
texts = await reader.read_slack_data(channels=args.channels)
texts = await reader.read_slack_data(channels=channels)
if not texts:
print("No messages found! This could mean:")

View File

@@ -120,22 +120,36 @@ python -m apps.slack_rag \
--query "What did we discuss about the project?"
```
### 4.3 Real RAG Query Example
### 4.3 Real RAG Query Examples
This example demonstrates a successful Slack RAG integration query against the Sky Lab Computing workspace's "random" channel. The system successfully retrieves actual conversation messages and performs semantic search with high relevance scores, including finding specific LEANN announcements.
This section demonstrates successful Slack RAG integration queries against the Sky Lab Computing workspace's "random" channel. The system successfully retrieves actual conversation messages and performs semantic search with high relevance scores, including finding specific research paper announcements and technical discussions.
**Key Features Demonstrated:**
- **Real Slack Integration**: Successfully connects to Slack via MCP server
- **Actual Message Retrieval**: Fetches real conversation history including specific announcements
- **Working RAG Pipeline**: Complete index building, search, and response generation
- **High Relevance Search**: Successfully finds and retrieves LEANN announcement messages
- **Challenging Query**: Demonstrates ability to find specific content within conversation history
- **High Relevance Search**: Successfully finds and retrieves specific research paper messages
- **Individual Message Processing**: Demonstrates ability to find specific content within conversation history
### Screenshots
### Example 1: Advisor Models Query
![Sky Random RAG - Real Slack Integration](videos/slack_integration.png)
**Query:** "train black-box models to adopt to your personal data"
![Sky Random RAG - Real Slack Integration Results](videos/slack_integration_2.png)
This query demonstrates the system's ability to find specific research announcements about training black-box models for personal data adaptation.
![Advisor Models Query - Setup](videos/slack_integration_1.png)
![Advisor Models Query - Results](videos/slack_integration_1.2.png)
### Example 2: Barbarians at the Gate Query
**Query:** "AI-driven research systems ADRS"
This query demonstrates the system's ability to find specific research announcements about AI-driven research systems and algorithm discovery.
![Barbarians Query - Setup](videos/slack_integration_2.1.png)
![Barbarians Query - Results](videos/slack_integration_2.2.png)
### Prerequisites
@@ -147,39 +161,38 @@ This example demonstrates a successful Slack RAG integration query against the S
1) Set the workspace token for this shell
```bash
export SLACK_MCP_XOXP_TOKEN="xoxb-***-redacted-***"
export SLACK_MCP_XOXP_TOKEN="xoxp-***-redacted-***"
```
2) Run a real query against the "random" channel by channel ID (C0GN5BX0F)
2) Run queries against the "random" channel by channel ID (C0GN5BX0F)
**Advisor Models Query:**
```bash
python -m apps.slack_rag \
--mcp-server "slack-mcp-server" \
--workspace-name "Sky Lab Computing" \
--channels C0GN5BX0F \
--max-messages-per-channel 1000 \
--query "What is LEANN about?"
--max-messages-per-channel 100000 \
--query "train black-box models to adopt to your personal data" \
--llm simulated \
--no-concatenate-conversations \
--force-rebuild
```
Expected: The system should retrieve the LEANN announcement message from Yichuan Wang, which contains:
```
Yichuan Wang Aug 8th at 3:27 PM
We'd like to share LEANN — a local RAG system with a 97% smaller index that lets you chat with all your emails, file system, and more. It's fully Claude Codecompatible via a built-in semantic search MCP server.
:loudspeaker: Tweet: https://x.com/YichuanM/status/1953886752240013803 (reposts appreciated :raised_hands:)
:computer: Code: https://github.com/yichuan-w/LEANN (stars/shares welcome) (edited)
X (formerly Twitter)X (formerly Twitter)
Yichuan Wang (@YichuanM) on X
1/N :rocket: Launching LEANN — the tiniest vector index on Earth!
Fast, accurate, and 100% private RAG on your MacBook.
0% internet. 97% smaller. Semantic search on everything.
Your personal Jarvis, ready to dive into your emails, chats, and more.
:link: Code: https://t.co/QwkYx1t0oa
**Barbarians at the Gate Query:**
```bash
python -m apps.slack_rag \
--mcp-server "slack-mcp-server" \
--workspace-name "Sky Lab Computing" \
--channels C0GN5BX0F \
--max-messages-per-channel 100000 \
--query "AI-driven research systems ADRS" \
--llm simulated \
--no-concatenate-conversations \
--force-rebuild
```
This demonstrates the system's ability to find and retrieve specific announcements about LEANN from the conversation history.
These examples demonstrate the system's ability to find and retrieve specific research announcements and technical discussions from the conversation history, showcasing the power of semantic search in Slack data.
3) Optional: Ask a broader question

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 379 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 367 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 422 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 398 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 421 KiB

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 345 KiB