Mocking the OpenAI API with respx in Python
View the series
- See how I used the OpenAI API to generate audio and images
- See why structured outputs also need hard guardrails
- Grab ready-to-use pytest snippets mocking the OpenAI API
How do you test the parts of your application that call the OpenAI API?
- Make real calls to the API
- Mock the API
In Python, the OpenAI API library ↗ is built on the httpx ↗ HTTP client, so we can mock it with respx ↗, a library that intercepts and mocks httpx requests. That makes it a good fit for this use case.
Here's one way to do it with pytest ↗ (respx ships a pytest plugin, so the respx marker and respx_mock fixture below work out of the box):
# test_mock_openai_api_with_respx.py
from openai import OpenAI
from respx import MockRouter
import httpx
import pytest


def foo(client: OpenAI):
    response = client.responses.parse(
        model="gpt-5.2",
        input="reply only with foo",
        max_output_tokens=256,
    )
    return response.output_text


@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_foo(respx_mock: MockRouter):
    client = OpenAI(api_key="foo-api-key")

    # Not a complete response from OpenAI. Only the "output"
    # part that we need with output[0].content[0].text set to foo
    json_data = {
        "output": [
            {
                "type": "message",
                "id": "msg_1",
                "status": "completed",
                "role": "assistant",
                "content": [
                    {
                        "type": "output_text",
                        "text": "foo",
                        "annotations": [],
                    }
                ],
            }
        ]
    }

    # respx intercepts POST requests to https://api.openai.com/v1/responses
    # and returns a 200 status with the json_data payload
    respx_mock.post("/responses").mock(
        return_value=httpx.Response(
            200,
            json=json_data,
        )
    )

    assert foo(client) == "foo"
You can run this test like this:
$ uv init
$ uv add pytest respx openai
$ uv run pytest test_mock_openai_api_with_respx.py
If you're testing retry logic, for instance, you may want the mocked API to return different responses in a set order. You can do that with the side_effect parameter of the mock method: use it instead of return_value and pass it an iterable.
Here's an example where the function foo first returns foo, then bar, and finally baz:
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_foo_bar_baz(respx_mock: MockRouter):
    client = OpenAI(api_key="foo-api-key")

    def json_data(msg_id: str, output_text: str):
        return {
            "output": [
                {
                    "type": "message",
                    "id": msg_id,
                    "status": "completed",
                    "role": "assistant",
                    "content": [
                        {
                            "type": "output_text",
                            "text": output_text,
                            "annotations": [],
                        }
                    ],
                }
            ]
        }

    respx_mock.post("/responses").mock(
        side_effect=[
            httpx.Response(
                200,
                json=json_data("msg_1", "foo"),
            ),
            httpx.Response(
                200,
                json=json_data("msg_2", "bar"),
            ),
            httpx.Response(
                200,
                json=json_data("msg_3", "baz"),
            ),
        ]
    )

    assert foo(client) == "foo"
    assert foo(client) == "bar"
    assert foo(client) == "baz"
Instead of an iterable, you can also pass side_effect a function to call or an exception to raise. See the respx docs ↗.
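For instance, here's a quick sketch (the callable name and payload are just illustrative, not part of the examples above):

# side_effect with a callable: respx calls it with the intercepted
# httpx.Request and returns whatever httpx.Response the callable gives back
def fake_responses_endpoint(request: httpx.Request) -> httpx.Response:
    return httpx.Response(200, json={"output": []})

respx_mock.post("/responses").mock(side_effect=fake_responses_endpoint)

# side_effect with an exception (class or instance): respx raises it,
# which is handy for testing how your code handles network failures
respx_mock.post("/responses").mock(side_effect=httpx.ConnectError("boom"))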
You'll find more examples below. They come from my phrasebook-fr-to-en ↗ CLI, which enriches French-to-English phrasebooks with translations, audio, and images using the OpenAI API.
That's all I have for today! Talk soon 👋
The function to be tested - generate_translations
def generate_translations(
    record_original: tuple[str, str, str], client: OpenAI
) -> list[tuple[str, str]]:
    from pydantic import BaseModel

    class Translation(BaseModel):
        french: str
        english: str

    class Translations(BaseModel):
        # DON'T USE: conlist(tuple[str, str], min_length=2, max_length=2)
        # It broke the OpenAI API, which generated outputs with 128,000 tokens,
        # mostly whitespace and newlines.
        translations: list[Translation]

    _, french, english = record_original
    model = "gpt-5.2"
    instructions = """# Role and Objective
You are a bilingual (French/English) teacher specializing in practical language learning. Your task is to help expand a French-to-English phrasebook by creating relevant sentence pairs and highlighting key language aspects.
# Instructions
- For each prompt, you will get a French sentence and its English translation.
- Your tasks:
  1. Generate exactly two related English sentences, each with its French translation.
  2. Use these to show:
     - An English grammar point, or
     - Useful nouns, verbs, or
     - Alternative phrasing (formality, slang, etc.).
  3. Ensure all English examples are natural and suitable for daily use.
# Context
- The learner is a native French speaker advancing in English.
- The goal is to create a learner-friendly, practical phrasebook."""
    input_msg = f"{french} -> {english}"

    logger.info(f"Generating translations for record {record_original}")

    attempt = 1
    translations = []
    while not translations:
        with log_request_info_when_api_error_raised():
            response = client.responses.parse(
                model=model,
                instructions=instructions,
                input=input_msg,
                text_format=Translations,
                max_output_tokens=256,
            )

        # If we decided to use gpt-5-nano with the same low max_output_tokens,
        # the tokens would be consumed by the reasoning, we would get no text
        # output, and output_parsed would be None.
        if not response.output_parsed:
            if attempt < 3:
                logger.info(
                    f"No translations were returned by the model at attempt {attempt}."
                )
                attempt += 1
                continue
            else:
                raise ValueError(
                    f"No translations were returned by the model.\nResponse: {response.to_json()}"
                )

        translations = response.output_parsed.translations
        if (tlen := len(translations)) < 2:
            if attempt < 3:
                logger.info(
                    f"Wrong number of translations returned by the model at attempt {attempt}."
                )
                attempt += 1
                translations = []
                continue
            else:
                raise ValueError(
                    (
                        f"Wrong number of translations: {tlen}. 2 were expected.\n"
                        f"Response: {response.to_json()}"
                    )
                )

    logger.info(
        f"Translations generated for record {record_original} using model {model} and input '{input_msg}'"
    )
    return [(t.french, t.english) for t in translations][:2]
Test mocking the Responses API, HTTP 200 OK
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_generate_translations(
    respx_mock: MockRouter, caplog: pytest.LogCaptureFixture
):
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")
    record = ("2025-12-15", "fr1", "en1")
    client = OpenAI(api_key="foo-api-key")

    def partial_json_response(output_id: str, output_text: str):
        return {
            "output": [
                {
                    "type": "message",
                    "id": f"msg_{output_id}",
                    "status": "completed",
                    "role": "assistant",
                    "content": [
                        {
                            "type": "output_text",
                            "text": output_text,
                            "annotations": [],
                        }
                    ],
                }
            ]
        }

    # We receive exactly 2 translations, which is what we want
    respx_mock.post("/responses").mock(
        return_value=httpx.Response(
            200,
            json=partial_json_response(
                "id_1",
                '{"translations":[{"french":"fr2","english":"en2"},{"french":"fr3","english":"en3"}]}',
            ),
        )
    )
    translations = cli.generate_translations(record, client)
    assert translations == [("fr2", "en2"), ("fr3", "en3")]
    assert (
        "Generating translations for record ('2025-12-15', 'fr1', 'en1')" in caplog.text
    )
    assert (
        "Translations generated for record ('2025-12-15', 'fr1', 'en1')" in caplog.text
    )
    assert "using model gpt-5.2 and input 'fr1 -> en1'" in caplog.text
    caplog.clear()

    # First request returns 3 translations -> we take the first 2.
    # Second request would return 2 translations, which is ok, but we
    # never send that second request because we stopped at the first one.
    respx_mock.post("/responses").mock(
        side_effect=[
            httpx.Response(
                200,
                json=partial_json_response(
                    "id_1",
                    '{"translations":[{"french":"fr2","english":"en2"},{"french":"fr3","english":"en3"}, {"french":"fr4","english":"en4"}]}',
                ),
            ),
            httpx.Response(
                200,
                json=partial_json_response(
                    "id_2",
                    '{"translations":[{"french":"frA","english":"enA"},{"french":"frB","english":"enB"}]}',
                ),
            ),
        ]
    )
    translations = cli.generate_translations(record, client)
    assert translations == [("fr2", "en2"), ("fr3", "en3")]

    # 3 attempts with the 3rd OK:
    # First request returns 1 translation -> should be 2, so we retry.
    # Second request returns no translation -> should be 2, so we retry.
    # Third request returns 2 translations -> this is ok.
    respx_mock.post("/responses").mock(
        side_effect=[
            httpx.Response(
                200,
                json=partial_json_response(
                    "id_1",
                    '{"translations":[{"french":"fr2","english":"en2"}]}',
                ),
            ),
            httpx.Response(
                200,
                json={
                    "incomplete_details": {"reason": "max_output_tokens"},
                    "output": [
                        {
                            "id": "rs_0c8b0343bd64d781006971f5c6041c8194b28661972de6acc2",
                            "summary": [],
                            "type": "reasoning",
                        }
                    ],
                },
            ),
            httpx.Response(
                200,
                json=partial_json_response(
                    "id_2",
                    '{"translations":[{"french":"fr2","english":"en2"},{"french":"fr3","english":"en3"}]}',
                ),
            ),
        ]
    )
    translations = cli.generate_translations(record, client)
    assert translations == [("fr2", "en2"), ("fr3", "en3")]
    assert "No translations were returned by the model at attempt 2." in caplog.text
    assert (
        "Wrong number of translations returned by the model at attempt 1."
        in caplog.text
    )

    # Raise an error because we receive only one translation pair each
    # time we send a request to the API (at the 3rd attempt we raise an error)
    respx_mock.post("/responses").mock(
        return_value=httpx.Response(
            200,
            json=partial_json_response(
                "id_1",
                '{"translations":[{"french":"fr2","english":"en2"}]}',
            ),
        ),
    )
    with pytest.raises(ValueError, match="Wrong number of translations: 1."):
        translations = cli.generate_translations(record, client)

    # Raise an error because we receive no translation pair each time we
    # send a request to the API (at the 3rd attempt we raise an error).
    # This can happen if you use, for instance, gpt-5-nano with a low
    # max_output_tokens that is entirely consumed by the reasoning.
    # We're not using reasoning with gpt-5.2, but just in case.
    respx_mock.post("/responses").mock(
        return_value=httpx.Response(
            200,
            json={
                "incomplete_details": {"reason": "max_output_tokens"},
                "output": [
                    {
                        "id": "rs_0c8b0343bd64d781006971f5c6041c8194b28661972de6acc2",
                        "summary": [],
                        "type": "reasoning",
                    }
                ],
            },
        )
    )
    with pytest.raises(ValueError, match="No translations were returned by the model."):
        translations = cli.generate_translations(record, client)
Test mocking the Responses API, HTTP 401 Unauthorized
@pytest.mark.respx(base_url="https://api.openai.com/v1/")
def test_generate_translations_request_logged_when_api_error_raised(
    respx_mock: MockRouter, caplog: pytest.LogCaptureFixture
):
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")
    respx_mock.post("/responses").mock(
        return_value=httpx.Response(
            401, json={"error": {"message": "Incorrect API key provided"}}
        )
    )
    record = ("2025-12-15", "fr1", "en1")
    client = OpenAI(api_key="foo-api-key", max_retries=0)

    with pytest.raises(APIError):
        translations = cli.generate_translations(record, client)

    # Log the httpx request
    assert "<Request('POST', 'https://api.openai.com/v1/responses')>" in caplog.text
    # Log the httpx headers: Headers({'host': 'api.openai.com', ...})
    assert "Request headers -" in caplog.text
    assert "'host': 'api.openai.com'" in caplog.text
    # Ensure the API key is not logged in the headers
    assert "foo-api-key" not in caplog.text
    # Log the httpx request body
    assert re.search(r"Request body - .*\"input\"\s*:\s*\"fr1 -> en1\"", caplog.text)
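The assertions above rely on the log_request_info_when_api_error_raised() context manager used in generate_translations, which isn't shown in this post. As a rough idea of what such a context manager could look like, here's a minimal sketch based on those assertions (my assumption, not the actual code from the CLI):

import logging
from contextlib import contextmanager

from openai import APIError

logger = logging.getLogger(__name__)


@contextmanager
def log_request_info_when_api_error_raised():
    # Hypothetical sketch: when the OpenAI client raises an APIError, log the
    # underlying httpx request, its headers without the Authorization header
    # (so the API key never ends up in the logs), and its body, then re-raise.
    try:
        yield
    except APIError as exc:
        request = exc.request
        headers = {k: v for k, v in request.headers.items() if k != "authorization"}
        logger.info(f"Request - {request!r}")
        logger.info(f"Request headers - {headers}")
        logger.info(f"Request body - {request.content.decode()}")
        raise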
Test with real calls to the Responses API
@pytest.mark.skipif(
    os.getenv("OPENAI_LIVE") != "1",
    reason="Requires OPENAI_LIVE=1. In that case, we make real calls to the OpenAI API.",
)
def test_generate_translations_real(caplog: pytest.LogCaptureFixture):
    caplog.set_level(logging.INFO, logger="phrasebook_fr_to_en.cli")
    record = ("2025-12-15", "Il est beau.", "He is handsome.")
    client = OpenAI()

    translations = cli.generate_translations(record, client)

    # Raises an error if `translations` is not valid against `TranslationList`
    TranslationList = conlist(tuple[str, str], min_length=2, max_length=2)
    TypeAdapter(TranslationList).validate_python(translations)
    assert (
        "Translations generated for record ('2025-12-15', 'Il est beau.', 'He is handsome.')"
        in caplog.text
    )
    assert (
        "using model gpt-5.2 and input 'Il est beau. -> He is handsome.'" in caplog.text
    )
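To run that last test against the real API, something like this should do it (assuming OPENAI_API_KEY is set in your environment):
$ OPENAI_LIVE=1 uv run pytest -k test_generate_translations_real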