Mocking the OpenAI API with respx in Python
tldr: You can test code that calls the OpenAI API by mocking httpx. I show a simple pytest + respx setup, including retry simulations with the side_effect parameter. This is something I used to test my phrasebook CLI, which generates translations, audio, and images.
How do you test the parts of your application that call the OpenAI API?
- Make real calls to the API
- Mock the API
In Python, the OpenAI API library is built on the httpx HTTP client, so we can mock it with respx, a library designed specifically to mock httpx. That makes it a good fit for this use case.
Here's one way to do it with pytest:
# test_mock_openai_api_with_respx.py
from openai import OpenAI
from respx import MockRouter
import httpx
import pytest
def foo(client: OpenAI):
    """Send one Responses API request and return the text of the reply.

    The request asks the model to reply only with "foo"; the value
    returned is the ``output_text`` of the response object.
    """
    request_args = {
        "model": "gpt-5.2",
        "input": "reply only with foo",
        "max_output_tokens": 256,
    }
    return client.responses.create(**request_args).output_text
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_foo(respx_mock: MockRouter):
    """foo() returns the text carried by the single mocked assistant message."""
    # Not a complete OpenAI response: only the "output" part we need,
    # with output[0].content[0].text set to "foo".
    text_part = {
        "type": "output_text",
        "text": "foo",
        "annotations": [],
    }
    assistant_message = {
        "type": "message",
        "id": "msg_1",
        "status": "completed",
        "role": "assistant",
        "content": [text_part],
    }
    mocked_reply = httpx.Response(200, json={"output": [assistant_message]})

    # respx intercepts POST requests to https://api.openai.com/v1/responses
    # and answers them with the 200 reply built above.
    respx_mock.post("/responses").mock(return_value=mocked_reply)

    client = OpenAI(api_key="foo-api-key")
    assert foo(client) == "foo"
You can run this test like this (with openai, respx, and pytest installed):
$ pytest test_mock_openai_api_with_respx.py
If you're testing retry logic, for instance, you may want the mocked API to return different responses in a set order. You can do that with the side_effect parameter of the mock method. Use it instead of return_value and pass it an iterable.
Here's an example where the function foo first returns foo, then bar, and finally baz:
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_foo_bar_baz(respx_mock: MockRouter):
    """Successive foo() calls receive the queued mocked replies in order."""

    def json_data(msg_id: str, output_text: str):
        # Minimal "output" payload holding one assistant text message.
        text_part = {
            "type": "output_text",
            "text": output_text,
            "annotations": [],
        }
        message = {
            "type": "message",
            "id": msg_id,
            "status": "completed",
            "role": "assistant",
            "content": [text_part],
        }
        return {"output": [message]}

    # (message id, text) of each reply, in the order they must be served.
    queued = [("msg_1", "foo"), ("msg_2", "bar"), ("msg_3", "baz")]

    # side_effect takes an iterable: one response per intercepted request.
    respx_mock.post("/responses").mock(
        side_effect=[
            httpx.Response(200, json=json_data(msg_id, text))
            for msg_id, text in queued
        ]
    )

    client = OpenAI(api_key="foo-api-key")
    for _, expected_text in queued:
        assert foo(client) == expected_text
Instead of an iterable, you can pass a function to call. Or you can pass an exception to raise. See the respx docs.
Find more examples below. They're from my phrasebook-fr-to-en CLI. It enriches French-to-English phrasebooks with translations, audio, and images using the OpenAI API.
That's all I have for today! Talk soon 👋
The function to be tested - generate_translations
def generate_translations(
    record_original: tuple[str, str, str], client: OpenAI
) -> list[tuple[str, str]]:
    """Return two (french, english) pairs related to ``record_original``.

    The record's French and English sentences (its 2nd and 3rd items) are
    sent to the model, which is asked for structured output matching the
    local ``Translations`` schema.

    A response with no parsed output, or with fewer than two translations,
    triggers a retry. On the third such response a ``ValueError`` is
    raised. Translations beyond the first two are dropped.
    """
    from pydantic import BaseModel

    class Translation(BaseModel):
        french: str
        english: str

    class Translations(BaseModel):
        # DON'T USE: conlist(tuple[str, str], min_length=2, max_length=2)
        # This broke OpenAI API which generated outputs with 128,000 tokens.
        # Mostly, whitespaces and newlines.
        translations: list[Translation]

    max_attempts = 3
    _, french, english = record_original
    model = "gpt-5.2"
    instructions = """# Role and Objective
You are a bilingual (French/English) teacher specializing in practical language learning. Your task is to help expand a French-to-English phrasebook by creating relevant sentence pairs and highlighting key language aspects.
# Instructions
- For each prompt, you will get a French sentence and its English translation.
- Your tasks:
1. Generate exactly two related English sentences, each with its French translation.
2. Use these to show:
- An English grammar point, or
- Useful nouns, verbs, or
- Alternative phrasing (formality, slang, etc.).
3. Ensure all English examples are natural and suitable for daily use.
# Context
- The learner is a native French speaker advancing in English.
- The goal is to create a learner-friendly, practical phrasebook."""
    input_msg = f"{french} -> {english}"

    logger.info(f"Generating translations for record {record_original}")

    # Every pass either breaks out with a usable result, retries, or
    # raises on the last attempt, so the loop never ends by exhaustion.
    for attempt in range(1, max_attempts + 1):
        can_retry = attempt < max_attempts

        with log_request_info_when_api_error_raised():
            response = client.responses.parse(
                model=model,
                instructions=instructions,
                input=input_msg,
                text_format=Translations,
                max_output_tokens=256,
            )

        # If we decided to use gpt-5-nano with the same low max_output_tokens,
        # tokens would be consumed by the reasoning, we would get
        # no text output, and this would result in output_parsed being None.
        parsed = response.output_parsed
        if not parsed:
            if not can_retry:
                raise ValueError(
                    f"No translations were returned by the model.\nResponse: {response.to_json()}"
                )
            logger.info(
                f"No translations were returned by the model at attempt {attempt}."
            )
            continue

        translations = parsed.translations
        tlen = len(translations)
        if tlen >= 2:
            break

        if not can_retry:
            raise ValueError(
                (
                    f"Wrong number of translations: {tlen}. 2 were expected.\n"
                    f"Response: {response.to_json()}"
                )
            )
        logger.info(
            f"Wrong number of translations returned by the model at attempt {attempt}."
        )

    logger.info(
        f"Translations generated for record {record_original} using model {model} and input '{input_msg}'"
    )

    # Keep only the first two pairs, even if the model returned more.
    return [(pair.french, pair.english) for pair in translations[:2]]
Test mocking the Responses API, HTTP 200 OK
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_generate_translations(
    respx_mock: MockRouter, caplog: pytest.LogCaptureFixture
):
    """Drive generate_translations through mocked HTTP 200 responses.

    Scenarios, in order:
      1. exactly two translations are returned;
      2. three translations are returned (first two kept, no retry);
      3. two unusable responses followed by a valid one;
      4. only one translation on every request -> ValueError;
      5. no parsed output on every request -> ValueError.
    """
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")
    client = OpenAI(api_key="foo-api-key")
    record = ("2025-12-15", "fr1", "en1")

    # JSON texts the mocked model "writes" as its structured output.
    one_pair = '{"translations":[{"french":"fr2","english":"en2"}]}'
    two_pairs = '{"translations":[{"french":"fr2","english":"en2"},{"french":"fr3","english":"en3"}]}'
    three_pairs = '{"translations":[{"french":"fr2","english":"en2"},{"french":"fr3","english":"en3"}, {"french":"fr4","english":"en4"}]}'
    other_two_pairs = '{"translations":[{"french":"frA","english":"enA"},{"french":"frB","english":"enB"}]}'

    def partial_json_response(output_id: str, output_text: str):
        # Only the "output" part of a Responses API payload.
        text_part = {
            "type": "output_text",
            "text": output_text,
            "annotations": [],
        }
        message = {
            "type": "message",
            "id": f"msg_{output_id}",
            "status": "completed",
            "role": "assistant",
            "content": [text_part],
        }
        return {"output": [message]}

    def message_response(output_id: str, output_text: str):
        # 200 reply whose assistant message carries `output_text`.
        return httpx.Response(
            200, json=partial_json_response(output_id, output_text)
        )

    def reasoning_only_response():
        # 200 reply with a reasoning item only and no text output.
        return httpx.Response(
            200,
            json={
                "incomplete_details": {"reason": "max_output_tokens"},
                "output": [
                    {
                        "id": "rs_0c8b0343bd64d781006971f5c6041c8194b28661972de6acc2",
                        "summary": [],
                        "type": "reasoning",
                    }
                ],
            },
        )

    def mock_responses(**mock_kwargs):
        # (Re)configure what POST /responses answers from now on.
        respx_mock.post("/responses").mock(**mock_kwargs)

    # 1. We receive exactly 2 translations and this is what we want.
    mock_responses(return_value=message_response("id_1", two_pairs))

    translations = cli.generate_translations(record, client)

    assert translations == [("fr2", "en2"), ("fr3", "en3")]
    assert (
        "Generating translations for record ('2025-12-15', 'fr1', 'en1')" in caplog.text
    )
    assert (
        "Translations generated for record ('2025-12-15', 'fr1', 'en1')" in caplog.text
    )
    assert "using model gpt-5.2 and input 'fr1 -> en1'" in caplog.text

    caplog.clear()

    # 2. The first request returns 3 translations -> we take the first 2.
    # The second queued response would be fine too, but it is never
    # requested because we stop at the first one.
    mock_responses(
        side_effect=[
            message_response("id_1", three_pairs),
            message_response("id_2", other_two_pairs),
        ]
    )

    translations = cli.generate_translations(record, client)

    assert translations == [("fr2", "en2"), ("fr3", "en3")]

    # 3. Three attempts, the 3rd one OK:
    #    - first request returns 1 translation -> should be 2, so we retry
    #    - second request returns no translation -> should be 2, so we retry
    #    - third request returns 2 translations -> this is ok
    mock_responses(
        side_effect=[
            message_response("id_1", one_pair),
            reasoning_only_response(),
            message_response("id_2", two_pairs),
        ]
    )

    translations = cli.generate_translations(record, client)

    assert translations == [("fr2", "en2"), ("fr3", "en3")]
    assert "No translations were returned by the model at attempt 2." in caplog.text
    assert (
        "Wrong number of translations returned by the model at attempt 1."
        in caplog.text
    )

    # 4. An error is raised because every request returns only one
    # translation pair (the error is raised at the 3rd attempt).
    mock_responses(return_value=message_response("id_1", one_pair))

    with pytest.raises(ValueError, match="Wrong number of translations: 1."):
        cli.generate_translations(record, client)

    # 5. An error is raised because every request returns no translation
    # pair (the error is raised at the 3rd attempt).
    # This can happen if you use, for instance, gpt-5-nano with a limited
    # number of output tokens that is entirely consumed by the reasoning.
    # We're not using the reasoning of gpt-5.2, but just in case.
    mock_responses(return_value=reasoning_only_response())

    with pytest.raises(ValueError, match="No translations were returned by the model."):
        cli.generate_translations(record, client)
Test mocking the Responses API, HTTP 401 Unauthorized
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_generate_translations_request_logged_when_api_error_raised(
    respx_mock: MockRouter, caplog: pytest.LogCaptureFixture
):
    """A mocked 401 raises APIError and the request is logged without the API key."""
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")

    unauthorized = httpx.Response(
        401, json={"error": {"message": "Incorrect API key provided"}}
    )
    respx_mock.post("/responses").mock(return_value=unauthorized)

    client = OpenAI(api_key="foo-api-key", max_retries=0)
    record = ("2025-12-15", "fr1", "en1")

    with pytest.raises(APIError):
        cli.generate_translations(record, client)

    logged = caplog.text

    # The httpx request and its headers
    # (Headers({'host': 'api.openai.com', ...})) are logged.
    expected_fragments = (
        "<Request('POST', 'https://api.openai.com/v1/responses')>",
        "Request headers -",
        "'host': 'api.openai.com'",
    )
    for fragment in expected_fragments:
        assert fragment in logged

    # The API key must not be logged with the headers.
    assert "foo-api-key" not in logged

    # The httpx request body is logged.
    assert re.search(r"Request body - .*\"input\"\s*:\s*\"fr1 -> en1\"", logged)
Test with real calls to the Responses API
@pytest.mark.skipif(
    os.getenv("OPENAI_LIVE") != "1",
    reason="Requires OPENAI_LIVE=1. In that case, we do real call to OpenAI API.",
)
def test_generate_translations_real(caplog: pytest.LogCaptureFixture):
    """Live test: call the real OpenAI API and check result shape and logs."""
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")

    record = ("2025-12-15", "Il est beau.", "He is handsome.")
    translations = cli.generate_translations(record, OpenAI())

    # validate_python raises if `translations` is not a list of exactly
    # two (str, str) tuples.
    two_pairs_adapter = TypeAdapter(
        conlist(tuple[str, str], min_length=2, max_length=2)
    )
    two_pairs_adapter.validate_python(translations)

    logged = caplog.text
    assert (
        "Translations generated for record ('2025-12-15', 'Il est beau.', 'He is handsome.')"
        in logged
    )
    assert "using model gpt-5.2 and input 'Il est beau. -> He is handsome.'" in logged
References
- https://github.com/tonyaldon/phrasebook-fr-to-en
- https://platform.openai.com/docs/api-reference/responses
- https://github.com/openai/openai-python
- https://github.com/pytest-dev/pytest
- https://github.com/encode/httpx
- https://github.com/lundberg/respx
- https://lundberg.github.io/respx/guide/#mock-with-a-side-effect