Update Slack RAG integration with improved CSV parsing and new screenshots

- Fixed CSV message parsing in slack_mcp_reader.py to properly handle individual messages
- Updated slack_rag.py to filter empty channel strings
- Enhanced slack-setup-guide.md with two new query examples:
  - Advisor Models query: 'train black-box models to adopt to your personal data'
  - Barbarians at the Gate query: 'AI-driven research systems ADRS'
- Replaced old screenshots with four new ones showing both query examples
- Updated documentation to use User OAuth Token (xoxp-) instead of Bot Token (xoxb-)
- Added proper command examples with --no-concatenate-conversations and --force-rebuild flags
This commit is contained in:
aakash
2025-10-18 22:25:16 -07:00
parent 73ffc3cc37
commit fb9405e99a
9 changed files with 162 additions and 61 deletions

View File

@@ -203,6 +203,7 @@ class SlackMCPReader:
# Common tool names might be: 'get_messages', 'list_messages', 'fetch_channel_history'
tools = await self.list_available_tools()
logger.info(f"Available tools: {[tool.get('name') for tool in tools]}")
message_tool = None
# Look for a tool that can fetch messages - prioritize conversations_history
@@ -213,6 +214,7 @@ class SlackMCPReader:
tool_name = tool.get("name", "").lower()
if "conversations_history" in tool_name:
message_tool = tool
logger.info(f"Found conversations_history tool: {tool}")
break
# If not found, look for other message-fetching tools
@@ -230,7 +232,7 @@ class SlackMCPReader:
raise RuntimeError("No message fetching tool found in MCP server")
# Prepare tool call parameters
tool_params = {"limit": limit}
tool_params = {"limit": "180d"} # Use 180 days to get older messages
if channel:
# For conversations_history, use channel_id parameter
if message_tool["name"] == "conversations_history":
@@ -241,6 +243,8 @@ class SlackMCPReader:
tool_params[param_name] = channel
break
logger.info(f"Tool parameters: {tool_params}")
fetch_request = {
"jsonrpc": "2.0",
"id": 3,
@@ -261,8 +265,8 @@ class SlackMCPReader:
try:
messages = json.loads(content["text"])
except json.JSONDecodeError:
# If not JSON, treat as plain text
messages = [{"text": content["text"], "channel": channel or "unknown"}]
# If not JSON, try to parse as CSV format (Slack MCP server format)
messages = self._parse_csv_messages(content["text"], channel)
else:
messages = result["content"]
else:
@@ -271,6 +275,56 @@ class SlackMCPReader:
return messages if isinstance(messages, list) else [messages]
def _parse_csv_messages(self, csv_text: str, channel: str) -> list[dict[str, Any]]:
"""Parse CSV format messages from Slack MCP server."""
import csv
import io
messages = []
try:
# Split by lines and process each line as a CSV row
lines = csv_text.strip().split("\n")
if not lines:
return messages
# Skip header line if it exists
start_idx = 0
if lines[0].startswith("MsgID,UserID,UserName"):
start_idx = 1
for line in lines[start_idx:]:
if not line.strip():
continue
# Parse CSV line
reader = csv.reader(io.StringIO(line))
try:
row = next(reader)
if len(row) >= 7: # Ensure we have enough columns
message = {
"ts": row[0],
"user": row[1],
"username": row[2],
"real_name": row[3],
"channel": row[4],
"thread_ts": row[5],
"text": row[6],
"time": row[7] if len(row) > 7 else "",
"reactions": row[8] if len(row) > 8 else "",
"cursor": row[9] if len(row) > 9 else "",
}
messages.append(message)
except Exception as e:
logger.warning(f"Failed to parse CSV line: {line[:100]}... Error: {e}")
continue
except Exception as e:
logger.warning(f"Failed to parse CSV messages: {e}")
# Fallback: treat entire text as one message
messages = [{"text": csv_text, "channel": channel or "unknown"}]
return messages
def _format_message(self, message: dict[str, Any]) -> str:
"""Format a single message for indexing."""
text = message.get("text", "")
@@ -342,6 +396,40 @@ class SlackMCPReader:
return "\n".join(content_parts)
async def get_all_channels(self) -> list[str]:
    """Get list of all available channels."""
    request = {
        "jsonrpc": "2.0",
        "id": 4,
        "method": "tools/call",
        "params": {"name": "channels_list", "arguments": {}},
    }
    try:
        response = await self.send_mcp_request(request)
        # Guard clauses replace the original nested conditionals.
        if "result" not in response:
            return []
        result = response["result"]
        if "content" not in result or not isinstance(result["content"], list):
            return []
        first = result["content"][0] if result["content"] else {}
        if "text" not in first:
            return []
        # Pull channel IDs / names out of the free-form text listing.
        channels: list[str] = []
        for raw_line in first["text"].split("\n"):
            if not raw_line.strip():
                continue
            if "#" not in raw_line and "C" not in raw_line[:10]:
                continue
            for token in raw_line.split():
                if token.startswith("C") and len(token) > 5:
                    channels.append(token)  # looks like a channel ID
                elif token.startswith("#"):
                    channels.append(token[1:])  # strip the leading '#'
        logger.info(f"Found {len(channels)} channels: {channels}")
        return channels
    except Exception as e:
        logger.warning(f"Failed to get channels list: {e}")
        return []
async def read_slack_data(self, channels: Optional[list[str]] = None) -> list[str]:
"""
Read Slack data and return formatted text chunks.
@@ -378,36 +466,33 @@ class SlackMCPReader:
logger.warning(f"Failed to fetch messages from channel {channel}: {e}")
continue
else:
# Fetch from all available channels/conversations
# This is a simplified approach - real implementation would need to
# discover available channels first
try:
messages = await self.fetch_slack_messages(limit=1000)
if messages:
# Group messages by channel if concatenating
if self.concatenate_conversations:
channel_messages = {}
for message in messages:
channel = message.get(
"channel", message.get("channel_name", "general")
)
if channel not in channel_messages:
channel_messages[channel] = []
channel_messages[channel].append(message)
# Fetch from all available channels
logger.info("Fetching from all available channels...")
all_channels = await self.get_all_channels()
# Create concatenated content for each channel
for channel, msgs in channel_messages.items():
text_content = self._create_concatenated_content(msgs, channel)
if not all_channels:
# Fallback to common channel names if we can't get the list
all_channels = ["general", "random", "announcements", "C0GN5BX0F"]
logger.info(f"Using fallback channels: {all_channels}")
for channel in all_channels:
try:
logger.info(f"Searching channel: {channel}")
messages = await self.fetch_slack_messages(channel=channel, limit=1000)
if messages:
if self.concatenate_conversations:
text_content = self._create_concatenated_content(messages, channel)
if text_content.strip():
all_texts.append(text_content)
else:
# Process individual messages
for message in messages:
formatted_msg = self._format_message(message)
if formatted_msg.strip():
all_texts.append(formatted_msg)
except Exception as e:
logger.error(f"Failed to fetch messages: {e}")
else:
# Process individual messages
for message in messages:
formatted_msg = self._format_message(message)
if formatted_msg.strip():
all_texts.append(formatted_msg)
except Exception as e:
logger.warning(f"Failed to fetch messages from channel {channel}: {e}")
continue
return all_texts

View File

@@ -146,8 +146,11 @@ class SlackMCPRAG(BaseRAGExample):
if args.workspace_name:
print(f"Workspace: {args.workspace_name}")
if args.channels:
print(f"Channels: {', '.join(args.channels)}")
# Filter out empty strings from channels
channels = [ch for ch in args.channels if ch.strip()] if args.channels else None
if channels:
print(f"Channels: {', '.join(channels)}")
else:
print("Fetching from all available channels")
@@ -166,7 +169,7 @@ class SlackMCPRAG(BaseRAGExample):
retry_delay=args.retry_delay,
)
texts = await reader.read_slack_data(channels=args.channels)
texts = await reader.read_slack_data(channels=channels)
if not texts:
print("No messages found! This could mean:")

View File

@@ -120,22 +120,36 @@ python -m apps.slack_rag \
--query "What did we discuss about the project?"
```
### 4.3 Real RAG Query Example
### 4.3 Real RAG Query Examples
This example demonstrates a successful Slack RAG integration query against the Sky Lab Computing workspace's "random" channel. The system successfully retrieves actual conversation messages and performs semantic search with high relevance scores, including finding specific LEANN announcements.
This section demonstrates successful Slack RAG integration queries against the Sky Lab Computing workspace's "random" channel. The system successfully retrieves actual conversation messages and performs semantic search with high relevance scores, including finding specific research paper announcements and technical discussions.
**Key Features Demonstrated:**
- **Real Slack Integration**: Successfully connects to Slack via MCP server
- **Actual Message Retrieval**: Fetches real conversation history including specific announcements
- **Working RAG Pipeline**: Complete index building, search, and response generation
- **High Relevance Search**: Successfully finds and retrieves LEANN announcement messages
- **Challenging Query**: Demonstrates ability to find specific content within conversation history
- **High Relevance Search**: Successfully finds and retrieves specific research paper messages
- **Individual Message Processing**: Demonstrates ability to find specific content within conversation history
### Screenshots
### Example 1: Advisor Models Query
![Sky Random RAG - Real Slack Integration](videos/slack_integration.png)
**Query:** "train black-box models to adopt to your personal data"
![Sky Random RAG - Real Slack Integration Results](videos/slack_integration_2.png)
This query demonstrates the system's ability to find specific research announcements about training black-box models for personal data adaptation.
![Advisor Models Query - Setup](videos/slack_integration_1.png)
![Advisor Models Query - Results](videos/slack_integration_1.2.png)
### Example 2: Barbarians at the Gate Query
**Query:** "AI-driven research systems ADRS"
This query demonstrates the system's ability to find specific research announcements about AI-driven research systems and algorithm discovery.
![Barbarians Query - Setup](videos/slack_integration_2.1.png)
![Barbarians Query - Results](videos/slack_integration_2.2.png)
### Prerequisites
@@ -147,39 +161,38 @@ This example demonstrates a successful Slack RAG integration query against the S
1) Set the workspace token for this shell
```bash
export SLACK_MCP_XOXP_TOKEN="xoxb-***-redacted-***"
export SLACK_MCP_XOXP_TOKEN="xoxp-***-redacted-***"
```
2) Run a real query against the "random" channel by channel ID (C0GN5BX0F)
2) Run queries against the "random" channel by channel ID (C0GN5BX0F)
**Advisor Models Query:**
```bash
python -m apps.slack_rag \
--mcp-server "slack-mcp-server" \
--workspace-name "Sky Lab Computing" \
--channels C0GN5BX0F \
--max-messages-per-channel 1000 \
--query "What is LEANN about?"
--max-messages-per-channel 100000 \
--query "train black-box models to adopt to your personal data" \
--llm simulated \
--no-concatenate-conversations \
--force-rebuild
```
Expected: The system should retrieve the LEANN announcement message from Yichuan Wang, which contains:
```
Yichuan Wang Aug 8th at 3:27 PM
We'd like to share LEANN — a local RAG system with a 97% smaller index that lets you chat with all your emails, file system, and more. It's fully Claude Codecompatible via a built-in semantic search MCP server.
:loudspeaker: Tweet: https://x.com/YichuanM/status/1953886752240013803 (reposts appreciated :raised_hands:)
:computer: Code: https://github.com/yichuan-w/LEANN (stars/shares welcome) (edited)
X (formerly Twitter)X (formerly Twitter)
Yichuan Wang (@YichuanM) on X
1/N :rocket: Launching LEANN — the tiniest vector index on Earth!
Fast, accurate, and 100% private RAG on your MacBook.
0% internet. 97% smaller. Semantic search on everything.
Your personal Jarvis, ready to dive into your emails, chats, and more.
:link: Code: https://t.co/QwkYx1t0oa
**Barbarians at the Gate Query:**
```bash
python -m apps.slack_rag \
--mcp-server "slack-mcp-server" \
--workspace-name "Sky Lab Computing" \
--channels C0GN5BX0F \
--max-messages-per-channel 100000 \
--query "AI-driven research systems ADRS" \
--llm simulated \
--no-concatenate-conversations \
--force-rebuild
```
This demonstrates the system's ability to find and retrieve specific announcements about LEANN from the conversation history.
These examples demonstrate the system's ability to find and retrieve specific research announcements and technical discussions from the conversation history, showcasing the power of semantic search in Slack data.
3) Optional: Ask a broader question

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 379 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 367 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 422 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 398 KiB

View File

Binary file not shown.

After

Width:  |  Height:  |  Size: 421 KiB

View File

Binary file not shown.

Before

Width:  |  Height:  |  Size: 345 KiB