From Strings to Programs: A Practical Guide to Prompt Engineering & Optimization in Python
2025-09-06
TL;DR We’ll take a real, slightly messy task—extracting before‑tax and after‑tax totals from receipt images—and walk it from a brittle single prompt to a robust, testable program.
- Start with LiteLLM + a multi-modal language model as a baseline.
- Add structured output (Pydantic) + evaluation (a crisp metric).
- Batch experiments with Pixeltable.
- Build a minimal automatic prompt optimizer by hand (grounded in failing examples).
- Switch to DSPy to get prompts-as-programs, signatures, adapters, and MIPROv2 auto‑optimization.
- Try Chain‑of‑Thought and ReAct tools; learn when they help—and when they don’t.
- Finish with a simple multi‑output signature that guides reasoning; in our run this reached 11/11 with a larger model.
This tutorial is sponsored by Pixeltable
As your AI systems become more widely used, you may feel the urge to measure and optimize the quality of your prompts—the very foundation on which your entire system relies (I certainly did). By the end of this tutorial, you’ll understand exactly what world-class optimizers like DSPy do, how to use DSPy, and how to customize it for your specific needs (for example, optimizing with DSPy and then running the optimized prompt on your own system).
Our focus here is on prompt optimization. While DSPy and its creators are world-renowned for prompt optimization, that is not actually DSPy’s primary focus. DSPy is designed to help you build compound AI systems—that is, systems where code, control flow, and one or many AI components are interleaved. To build such systems, tuning (i.e., optimization) is essential. As a result, DSPy offers very strong tooling for optimizing prompts. However, this capability is somewhat “baked in”: DSPy doesn’t explicitly promise to optimize prompts, but rather to optimize AI systems as a whole. Because of this, it can sometimes feel a bit like a black box.
I have created tutorials—and will create more—on how to use DSPy to build compound AI systems and why you should lean into that paradigm as soon as you want to call an AI with code. But in this tutorial, we’ll focus specifically on prompt optimization.
This tutorial is divided into three parts. First, we’ll build our own prompt optimizer; I believe there’s no better way to truly understand something than to build it ourselves. Second, we’ll see how much easier and more effective it can be with DSPy. Third, we’ll explore how to customize DSPy to leverage its prompt optimization capabilities while still being able to “leave” DSPy and carry the optimized prompt with us.
First, a quick smoke test: a plain text call through LiteLLM to make sure the model and the API key are wired up.

import litellm
import os

response = litellm.completion(
model="groq/meta-llama/llama-4-scout-17b-16e-instruct",
messages=[{ "content": "Hello, how are you?","role": "user"}],
api_key=os.environ["GROQ_API_KEY"]  # not necessary, but it shows that you can provide your API key directly here
)
print(response.model_dump_json(indent=4))

{
"id": "chatcmpl-8f134b77-3909-4dd5-b144-c1c148a33ec3",
"created": 1756043843,
"model": "meta-llama/llama-4-scout-17b-16e-instruct",
"object": "chat.completion",
"system_fingerprint": "fp_5436ed2ebe",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "Hello! I'm just a language model, I don't have feelings like humans do, but I'm functioning properly and ready to help you with any questions or tasks you have! How can I assist you today?",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 41,
"prompt_tokens": 16,
"total_tokens": 57,
"completion_tokens_details": null,
"prompt_tokens_details": null,
"queue_time": 0.067231514,
"prompt_time": 0.00256158,
"completion_time": 0.096255738,
"total_time": 0.098817318
},
"usage_breakdown": null,
"x_groq": {
"id": "req_01k3e5x64sfpp8xwfvckd7gz5d"
},
"service_tier": "auto"
}
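If you only want the assistant’s reply rather than the full JSON dump, you can index into the response object directly; LiteLLM mirrors the OpenAI response shape, so both attribute and dict-style access work:

```python
# Just the assistant's text, not the whole response payload
print(response.choices[0].message.content)
```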
content_part1 = {
"type": "text",
"text": "Extract total and total before tax from the receipt"
}

from PIL import Image
import io
import base64
img = Image.open("images/receipts/IMG_2160.jpg").convert("RGB")
buf = io.BytesIO()
img.save(buf, format="JPEG")
b64 = base64.b64encode(buf.getvalue()).decode()
content_part2 = {
"type": "image_url",
"image_url": {
"url": f"data:image/jpeg;base64,{b64}",
"format": "image/jpeg"
}
}

messages = [{
"role": "user",
"content": [content_part1,content_part2]
}]

response = litellm.completion(
model="groq/meta-llama/llama-4-scout-17b-16e-instruct",
messages= messages,
temperature = 0
)
print(response.model_dump_json(indent=4))

{
"id": "chatcmpl-59ce83db-741f-4139-bd20-8992f3bbd14c",
"created": 1756044346,
"model": "meta-llama/llama-4-scout-17b-16e-instruct",
"object": "chat.completion",
"system_fingerprint": "fp_38b31b7025",
"choices": [
{
"finish_reason": "stop",
"index": 0,
"message": {
"content": "The total amount is $45.10.\n\nThe amount before tax is not explicitly stated, but we can calculate it by subtracting the tax amounts from the total. \n\nThe TPS (tax) is $1.96 and the TVQ (tax) is $3.91. \n\nSo, the amount before tax is: \n$45.10 - $1.96 - $3.91 = $39.23. \n\nTherefore, the total is $45.10 and the total before tax is $39.23.",
"role": "assistant",
"tool_calls": null,
"function_call": null
}
}
],
"usage": {
"completion_tokens": 108,
"prompt_tokens": 1910,
"total_tokens": 2018,
"completion_tokens_details": null,
"prompt_tokens_details": null,
"queue_time": 0.512238536,
"prompt_time": 0.053851619,
"completion_time": 0.259498072,
"total_time": 0.313349691
},
"usage_breakdown": null,
"x_groq": {
"id": "req_01k3e6cfdbegss78jm171zhenh"
},
"service_tier": "auto"
}
Let’s double-check the model’s arithmetic:

45.10 - 1.96 - 3.91
39.230000000000004
content_part1 = {
"type": "text",
"text": """
Extract the after tax total and the before tax total from the receipt
Put these numbers in: <before_tax_total> and <after_tax_total> xml tags.
"""
}
response = litellm.completion(
model="groq/meta-llama/llama-4-scout-17b-16e-instruct",
messages= [{
"role": "user",
"content": [content_part1,content_part2]
}]
)
# Let's fetch only the actual llm response this time:
response.choices[0]["message"]["content"] # what a bad api :(

'To solve this problem, we need to identify the before-tax total and the after-tax total from the receipt.\n\nThe receipt shows:\n- Total cost before taxes for the fuel is not explicitly stated, but we can calculate it by subtracting the tax amounts from the total.\n- The TPS (tax) amount is $1.96.\n- The TVQ (tax) amount is $3.91.\n- The total cost after taxes is $45.10.\n\nFirst, let\'s calculate the before-tax total:\nBefore-tax total = After-tax total - (TPS + TVQ)\nBefore-tax total = $45.10 - ($1.96 + $3.91) \nBefore-tax total = $45.10 - $5.87 \nBefore-tax total = $39.23 \n\nHowever, the receipt directly provides the before-tax total under "CARBURANT" as $45.10 - $5.87 (taxes) implies the subtotal is indeed $39.23 but let\'s verify with given data.\n\nThe before-tax total or subtotal is indeed $39.23 as calculated.\n\nThe after-tax total is given as $45.10.\n\nSo, the XML tags with the required information would be:\n\n<before_tax_total>$39.23</before_tax_total>\n<after_tax_total>$45.10</after_tax_total>'
from pydantic import BaseModel, field_validator
from typing import Optional, Dict
import re
class ReceiptTotals(BaseModel):
before_tax_total: Optional[float] = None
after_tax_total: Optional[float] = None
@field_validator("*", mode="before")
def _clean(cls, v):
if v in (None, ""):
return None
cleaned = re.sub(r"[^\d.]", "", str(v))
return float(cleaned) if cleaned else None
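As a quick check (not from the original run), the `_clean` validator strips currency symbols and any other non-numeric characters before converting to float:

```python
# "$39.23" -> 39.23, "45.10 CAD" -> 45.1
ReceiptTotals(before_tax_total="$39.23", after_tax_total="45.10 CAD")
# ReceiptTotals(before_tax_total=39.23, after_tax_total=45.1)
```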
raw = response.choices[0]["message"]["content"]
before = re.search(r"<before_tax_total>(.*?)</before_tax_total>", raw).group(1)
after = re.search(r"<after_tax_total>(.*?)</after_tax_total>", raw).group(1)
pred = ReceiptTotals(before_tax_total=before, after_tax_total=after)
pred
ReceiptTotals(before_tax_total=39.23, after_tax_total=45.1)
pred.before_tax_total
39.23
def metric(ground_truth, pred):
is_btax_same = ground_truth.before_tax_total == pred.before_tax_total
is_atax_same = ground_truth.after_tax_total == pred.after_tax_total
return float(is_btax_same and is_atax_same)

metric(
ReceiptTotals(before_tax_total=39.23, after_tax_total=45.10),
pred
)
1.0
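With a metric in hand, evaluation is just a loop over labeled examples. Here is a minimal sketch, assuming a hypothetical `GROUND_TRUTH` dict that maps receipt paths to their true totals and reusing the `extract_totals` helper we define a bit further down; the next step shows how Pixeltable makes these batch experiments easier to manage:

```python
# Hypothetical labels: path -> ReceiptTotals with the true values
GROUND_TRUTH = {
    "images/receipts/IMG_2160.jpg": ReceiptTotals(before_tax_total=39.23, after_tax_total=45.10),
    # ... one entry per labeled receipt
}

scores = []
for path, truth in GROUND_TRUTH.items():
    pred = ReceiptTotals(**extract_totals(Image.open(path)))  # extract_totals is defined below
    scores.append(metric(truth, pred))

print(f"accuracy: {sum(scores) / len(scores):.2f}")
```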
import pixeltable as pxt
from pixeltable import func
from pathlib import Path
pxt.drop_dir('tutoyt', force=True)
pxt.create_dir('tutoyt')
t = pxt.create_table('tutoyt.receipts',{
'receipt_path': pxt.type_system.StringType(nullable=False),
'receipt_image': pxt.Image
},
primary_key = 'receipt_path'
)
for p in Path('images/receipts').glob('*.jpg'):
t.insert(receipt_path=str(p), receipt_image=str(p))
t.show(n=3)

Created directory 'tutoyt'.
Created table 'receipts'.
Inserting rows into `receipts`: 1 rows [00:00, 24.97 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 148.58 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 82.38 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 180.02 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 155.96 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 173.20 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 118.07 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 138.36 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 112.65 rows/s]
Inserted 1 row with 0 errors.
Inserting rows into `receipts`: 1 rows [00:00, 197.44 rows/s]
Inserted 1 row with 0 errors.
| receipt_path | receipt_image |
|---|---|
| images/receipts/IMG_2160.jpg | (image) |
| images/receipts/IMG_2163.jpg | (image) |
| images/receipts/IMG_2166.jpg | (image) |
def extract_totals(img: Image.Image) -> Dict[str, float]:
"""
Extract the before-tax and after-tax totals from a receipt image.
Parameters
----------
img : PIL.Image.Image
The receipt image (already decoded).
Returns
-------
dict
Keys: ``before_tax_total``, ``after_tax_total`` (as floats).
"""
# --- 1. Encode image ---
buf = io.BytesIO()
img.convert("RGB").save(buf, format="JPEG", quality=95)
b64 = base64.b64encode(buf.getvalue()).decode()
# --- 2. Prompt ---
prompt = (
"Extract the after-tax total and the before-tax total from the receipt.\n"
"Return the values inside these XML tags:\n"
"<before_tax_total>VALUE</before_tax_total>\n"
"<after_tax_total>VALUE</after_tax_total>"
)
# --- 3. LLM call via LiteLLM ---
response = litellm.completion(
model="groq/meta-llama/llama-4-scout-17b-16e-instruct",
messages=[{
"role": "user",
"content": [
{"type": "text", "text": prompt},
{"type": "image_url", "image_url": {
"url": f"data:image/jpeg;base64,{b64}"
}}
]
}],
temperature=0
)
# --- 4. Parse & validate ---
raw = response.choices[0]["message"]["content"]
before = re.search(r"<before_tax_total>(.*?)</before_tax_total>", raw).group(1)
after = re.search(r"<after_tax_total>(.*?)</after_tax_total>", raw).group(1)
return ReceiptTotals(before_tax_total=before, after_tax_total=after).model_dump()

extract_totals(Image.open("images/receipts/IMG_2167.jpg"))
{'before_tax_total': 88.74, 'after_tax_total': 100.4}
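One caveat with the regex parsing above: if the model omits one of the tags, `re.search` returns `None` and `.group(1)` raises an `AttributeError`. A small helper (a sketch, not part of the original run) lets the parse fail soft instead, since `ReceiptTotals` already tolerates `None`:

```python
def find_tag(tag: str, text: str) -> Optional[str]:
    """Return the content of <tag>...</tag>, or None if the tag is missing."""
    m = re.search(rf"<{tag}>(.*?)</{tag}>", text, re.DOTALL)
    return m.group(1) if m else None

# Usage inside extract_totals:
# ReceiptTotals(
#     before_tax_total=find_tag("before_tax_total", raw),
#     after_tax_total=find_tag("after_tax_total", raw),
# )
```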
@pxt.udf
def extract_totals_udf(img: Image.Image) -> Dict[str, float]:
return extract_totals(img)
t.add_computed_column(extraction=extract_totals_udf(t.receipt_image))

Added 10 column values with 0 errors.
10 rows updated, 10 values computed.
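Every row now carries its extraction. Besides `t.show` (below), you can pull the computed column back into Python with a query; a sketch using Pixeltable’s `select`/`collect`:

```python
# Materialize paths and extractions as an in-memory result set
rows = t.select(t.receipt_path, t.extraction).collect()
```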
t.show(3)

| receipt_path | receipt_image | extraction |
|---|---|---|
| images/receipts/IMG_2160.jpg | (image) | {"after_tax_total": 45.1, "before_tax_total": 45.1} |
| images/receipts/IMG_2163.jpg |