Agent 怎么测试？从单元测试到端到端测试

你的 Agent 写完了，跑了几个例子觉得没问题。上线后，用户输入了一个特殊字符，Agent 崩溃了。

这不是偶然。Agent 的输出是 LLM 生成的，不确定性很高。传统软件测试方法不适用。

这篇文章，我系统讲解 Agent 测试的完整方法论。

Agent 测试的三大难点

难点一：输出不确定性

传统测试：

def test_add():
    assert add(1, 2) == 3  # 确定的

Agent 测试：

def test_chat():
    response = agent.chat("你好")
    assert response == "???"  # 不确定！每次可能不一样

难点二：依赖外部服务

Agent 依赖 LLM API，测试时：

不能每次都调用真实 API（成本高、慢）
需要 Mock，但 Mock LLM 的行为很复杂

难点三：多步骤执行

Agent 是"思考→执行→再思考"的循环，不是单次函数调用。

用户输入 → 思考 → 工具调用1 → 思考 → 工具调用2 → 思考 → 输出

测试要验证：
- 思考逻辑对不对
- 工具调用对不对
- 最终输出对不对

测试金字塔：Agent 版

         ┌─────────────┐
         │  E2E 测试   │  最少（1-3 个）
         │  （全链路）  │
         ├─────────────┤
         │ 集成测试     │  少量（10-20 个）
         │（真实 LLM）  │
         ├─────────────┤
         │ 单元测试     │  大量（100+ 个）
         │（Mock LLM）  │
         └─────────────┘

原则：

单元测试：快、多、覆盖细节
集成测试：慢、少、验证真实行为
E2E 测试：最慢、最少、验证完整流程

第一层：单元测试

测试工具函数

工具是纯函数，最容易测。

# tests/test_tools.py
import pytest
from tempfile import TemporaryDirectory
from pathlib import Path
from my_agent.tools import read_file, write_file, list_files

class TestReadFile:
    """测试 read_file 工具"""
    
    def test_read_existing_file(self):
        """测试读取存在的文件"""
        with TemporaryDirectory() as tmpdir:
            test_file = Path(tmpdir) / "test.txt"
            test_file.write_text("Hello, World!", encoding="utf-8")
            
            result = read_file.invoke({"path": str(test_file)})
            
            assert "Hello, World!" in result
            assert "错误" not in result.lower()
    
    def test_read_nonexistent_file(self):
        """测试读取不存在的文件"""
        result = read_file.invoke({"path": "/nonexistent/file.txt"})
        
        assert "错误" in result or "不存在" in result
    
    def test_read_file_too_large(self):
        """测试读取超大文件"""
        with TemporaryDirectory() as tmpdir:
            large_file = Path(tmpdir) / "large.txt"
            large_file.write_bytes(b"x" * (15 * 1024 * 1024))  # 15MB
            
            result = read_file.invoke({
                "path": str(large_file),
                "max_size_mb": 10,
            })
            
            assert "过大" in result or "超过" in result
    
    def test_read_binary_file(self):
        """测试读取二进制文件"""
        with TemporaryDirectory() as tmpdir:
            binary_file = Path(tmpdir) / "binary.bin"
            binary_file.write_bytes(b"\x00\x01\x02\x03")
            
            result = read_file.invoke({"path": str(binary_file)})
            
            # 应该报错或提示编码问题
            assert "错误" in result or "编码" in result or "文本" in result

class TestWriteFile:
    """测试 write_file 工具"""
    
    def test_write_new_file(self):
        """测试写入新文件"""
        with TemporaryDirectory() as tmpdir:
            test_file = Path(tmpdir) / "new.txt"
            
            result = write_file.invoke({
                "path": str(test_file),
                "content": "Test content",
            })
            
            assert "成功" in result
            assert test_file.exists()
            assert test_file.read_text() == "Test content"
    
    def test_overwrite_existing_file(self):
        """测试覆盖已存在的文件"""
        with TemporaryDirectory() as tmpdir:
            test_file = Path(tmpdir) / "existing.txt"
            test_file.write_text("Old content")
            
            result = write_file.invoke({
                "path": str(test_file),
                "content": "New content",
            })
            
            assert "成功" in result
            assert test_file.read_text() == "New content"
    
    def test_write_to_system_directory(self):
        """测试写入系统目录（应该被拒绝）"""
        result = write_file.invoke({
            "path": "/etc/test.txt",
            "content": "hacked",
        })
        
        assert "拒绝" in result or "不允许" in result or "错误" in result

测试状态管理

# tests/test_state.py
import pytest
from my_agent.state import AgentState, merge_lists

class TestState:
    """测试状态管理"""
    
    def test_merge_lists(self):
        """测试列表合并"""
        left = [{"role": "user", "content": "Hello"}]
        right = [{"role": "assistant", "content": "Hi"}]
        
        result = merge_lists(left, right)
        
        assert len(result) == 2
        assert result[0]["content"] == "Hello"
        assert result[1]["content"] == "Hi"
    
    def test_merge_lists_with_none(self):
        """测试 None 处理"""
        result = merge_lists(None, [{"role": "user", "content": "Test"}])
        
        assert len(result) == 1
        assert result[0]["content"] == "Test"
    
    def test_state_type_validation(self):
        """测试状态类型验证"""
        state: AgentState = {
            "messages": [],
            "tool_calls": [],
            "tool_results": [],
            "is_complete": False,
            "iteration": 0,
        }
        
        # 验证必需字段
        assert "messages" in state
        assert "tool_calls" in state
        assert "is_complete" in state

测试路由逻辑

# tests/test_router.py
import pytest
from my_agent.router import should_continue

class TestRouter:
    """测试路由逻辑"""
    
    def test_continue_when_has_tool_calls(self):
        """有工具调用时应该继续"""
        state = {
            "messages": [],
            "tool_calls": [{"name": "read_file", "args": {}}],
            "is_complete": False,
            "iteration": 0,
        }
        
        result = should_continue(state)
        
        assert result == "continue"
    
    def test_end_when_complete(self):
        """任务完成时应该结束"""
        state = {
            "messages": [],
            "tool_calls": [],
            "is_complete": True,
            "iteration": 5,
        }
        
        result = should_continue(state)
        
        assert result == "end"
    
    def test_end_when_max_iterations(self):
        """达到最大迭代次数应该结束"""
        state = {
            "messages": [],
            "tool_calls": [{"name": "read_file", "args": {}}],
            "is_complete": False,
            "iteration": 20,  # 达到上限
        }
        
        result = should_continue(state)
        
        assert result == "end"  # 强制结束，防止无限循环

第二层：集成测试（Mock LLM）

Mock LLM 响应

# tests/test_agent_integration.py
import pytest
from unittest.mock import Mock, patch
from my_agent.graph import build_agent_graph
from my_agent.state import create_initial_state

class TestAgentIntegration:
    """集成测试（Mock LLM）"""
    
    @pytest.fixture
    def mock_llm(self):
        """创建 Mock LLM"""
        mock = Mock()
        
        # 模拟 LLM 响应
        mock.invoke.return_value = Mock(
            content="我已读取文件",
            tool_calls=[{
                "name": "read_file",
                "args": {"path": "README.md"},
                "id": "call_123",
            }]
        )
        
        return mock
    
    def test_agent_calls_correct_tool(self, mock_llm):
        """测试 Agent 调用正确的工具"""
        with patch("my_agent.nodes.get_llm", return_value=mock_llm):
            graph = build_agent_graph()
            state = create_initial_state("读取 README.md")
            
            result = graph.invoke(state)
            
            # 验证：调用了 read_file 工具
            assert len(result["tool_calls"]) > 0
            assert result["tool_calls"][0]["name"] == "read_file"
    
    def test_agent_completes_after_tool_execution(self, mock_llm):
        """测试工具执行后 Agent 完成"""
        # 第一次调用返回工具调用
        mock_llm.invoke.side_effect = [
            Mock(
                content="",
                tool_calls=[{"name": "read_file", "args": {"path": "test.txt"}, "id": "call_1"}]
            ),
            # 第二次调用返回最终回复
            Mock(
                content="文件内容是...",
                tool_calls=[]
            )
        ]
        
        with patch("my_agent.nodes.get_llm", return_value=mock_llm):
            graph = build_agent_graph()
            state = create_initial_state("读取 test.txt")
            
            result = graph.invoke(state)
            
            # 验证：Agent 最终完成了
            assert result["is_complete"] == True
            assert len(result["messages"]) > 1

测试完整流程

def test_agent_full_workflow():
    """测试 Agent 完整工作流"""
    
    # 准备测试文件
    with TemporaryDirectory() as tmpdir:
        test_file = Path(tmpdir) / "test.txt"
        test_file.write_text("Hello from test file")
        
        # 创建 Agent
        graph = build_agent_graph()
        
        # Mock LLM
        with patch("my_agent.nodes.llm_with_tools") as mock_llm:
            mock_llm.invoke.side_effect = [
                # 第一次：决定调用工具
                Mock(tool_calls=[{
                    "name": "read_file",
                    "args": {"path": str(test_file)},
                    "id": "call_1"
                }]),
                # 第二次：返回结果
                Mock(content="文件内容是 'Hello from test file'", tool_calls=[])
            ]
            
            # 执行
            state = create_initial_state(f"读取 {test_file}")
            result = graph.invoke(state)
            
            # 验证
            assert result["is_complete"] == True
            assert len(result["tool_results"]) == 1
            assert "Hello from test file" in result["tool_results"][0]

第三层：端到端测试（真实 LLM）

测试真实 Agent

# tests/test_agent_e2e.py
import pytest
from my_agent.graph import build_agent_graph
from my_agent.state import create_initial_state

@pytest.fixture
def real_agent():
    """创建真实 Agent（需要真实 API Key）"""
    return build_agent_graph()

@pytest.mark.integration
class TestAgentE2E:
    """端到端测试（真实 LLM）"""
    
    def test_simple_query(self, real_agent):
        """测试简单查询"""
        state = create_initial_state("现在几点了？")
        
        result = real_agent.invoke(state)
        
        # 验证：有输出且完成
        assert result["is_complete"] == True
        assert len(result["messages"]) > 0
        
        # 验证：输出包含时间信息
        last_msg = result["messages"][-1].content
        assert any(kw in last_msg for kw in ["时间", "点", ":", "时"])
    
    def test_file_operation(self, real_agent, tmp_path):
        """测试文件操作"""
        test_file = tmp_path / "test.txt"
        test_file.write_text("Test content for E2E")
        
        state = create_initial_state(f"读取 {test_file} 的内容")
        
        result = real_agent.invoke(state)
        
        # 验证：读取成功
        assert "Test content for E2E" in str(result["messages"])
    
    def test_error_handling(self, real_agent):
        """测试错误处理"""
        state = create_initial_state("读取 /nonexistent/file.txt")
        
        result = real_agent.invoke(state)
        
        # 验证：没有崩溃，返回了错误信息
        assert result["is_complete"] == True
        assert any(
            "错误" in str(msg) or "不存在" in str(msg)
            for msg in result["messages"]
        )

跳过集成测试

# 默认不运行集成测试（需要 API Key 和费用）
# 运行方式：pytest -m integration tests/

# pytest.ini
[pytest]
markers =
    integration: marks tests as integration tests (deselect with '-m "not integration"')

测试数据管理

测试用例数据化

# tests/test_cases.py

TEST_CASES = [
    {
        "name": "simple_greeting",
        "input": "你好",
        "expected_keywords": ["你好", "帮助", "什么"],
        "should_call_tools": False,
    },
    {
        "name": "read_file",
        "input": "读取 README.md",
        "expected_tools": ["read_file"],
        "expected_keywords": ["README", "内容"],
    },
    {
        "name": "complex_task",
        "input": "列出所有 Python 文件并搜索 TODO",
        "expected_tools": ["list_files", "search_in_file"],
    },
]

@pytest.mark.parametrize("case", TEST_CASES)
def test_agent_cases(case, real_agent):
    """参数化测试"""
    state = create_initial_state(case["input"])
    result = real_agent.invoke(state)
    
    # 验证工具调用
    if "expected_tools" in case:
        tool_names = [tc["name"] for tc in result.get("tool_calls", [])]
        for expected_tool in case["expected_tools"]:
            assert expected_tool in tool_names
    
    # 验证关键词
    if "expected_keywords" in case:
        last_msg = str(result["messages"][-1].content)
        assert any(kw in last_msg for kw in case["expected_keywords"])

性能测试

测试响应时间

import time

def test_agent_response_time(real_agent):
    """测试 Agent 响应时间"""
    state = create_initial_state("你好")
    
    start = time.time()
    result = real_agent.invoke(state)
    elapsed = time.time() - start
    
    # 验证：响应时间小于 10 秒
    assert elapsed < 10, f"响应时间过长: {elapsed:.2f}s"
    print(f"响应时间: {elapsed:.2f}s")

测试 Token 消耗

def test_agent_token_usage(real_agent):
    """测试 Agent Token 消耗"""
    state = create_initial_state("读取 README.md")
    
    result = real_agent.invoke(state)
    
    # 计算总 token
    total_tokens = sum(
        len(msg.content.split()) * 1.3  # 粗略估算
        for msg in result["messages"]
        if hasattr(msg, "content")
    )
    
    # 验证：Token 消耗合理（小于 5000）
    assert total_tokens < 5000, f"Token 消耗过高: {total_tokens}"
    print(f"估算 Token: {total_tokens}")

测试覆盖率

配置 pytest-cov

# pytest.ini
[pytest]
addopts = --cov=my_agent --cov-report=html --cov-report=term
testpaths = tests

运行覆盖率测试

pytest --cov=my_agent --cov-report=html

目标：

工具函数：100%
状态管理：90%+
节点逻辑：80%+
整体：70%+

我踩过的真实坑

坑一：忘了 Mock 导致费用爆炸

# 错误：没有 Mock
def test_agent():
    agent = build_agent_graph()  # 使用真实 LLM
    result = agent.invoke(...)   # 每次测试都调用 API

解决：默认 Mock，集成测试单独标记。

坑二：测试依赖环境

# 错误：硬编码路径
def test_read_file():
    result = read_file.invoke({"path": "/Users/xxx/test.txt"})

解决：用临时目录。

def test_read_file():
    with TemporaryDirectory() as tmpdir:
        test_file = Path(tmpdir) / "test.txt"
        ...

坑三：测试不够独立

# 错误：测试互相依赖
def test_1():
    global_state["value"] = 1

def test_2():
    assert global_state["value"] == 1  # 依赖 test_1

解决：每个测试独立初始化状态。

下一步行动

给工具写单元测试：至少 3 个测试用例（正常、异常、边界）
Mock LLM 写集成测试：验证 Agent 调用了正确的工具
标记集成测试：用 @pytest.mark.integration 单独运行
配置覆盖率：确保关键代码被测试覆盖

Agent 测试的核心是：不测输出内容，测行为和流程。工具测函数，决策测逻辑，流程测完整性。

Agent 测试的三大难点​

难点一：输出不确定性​

难点二：依赖外部服务​

难点三：多步骤执行​

测试金字塔：Agent 版​

第一层：单元测试​

测试工具函数​

测试状态管理​

测试路由逻辑​

第二层：集成测试（Mock LLM）​

Mock LLM 响应​

测试完整流程​

第三层：端到端测试（真实 LLM）​

测试真实 Agent​

跳过集成测试​

测试数据管理​

测试用例数据化​

性能测试​

测试响应时间​

测试 Token 消耗​

测试覆盖率​

配置 pytest-cov​

运行覆盖率测试​

我踩过的真实坑​

坑一：忘了 Mock 导致费用爆炸​

坑二：测试依赖环境​

坑三：测试不够独立​

下一步行动​