Quickstart
Basic usage
Section titled “Basic usage”

from mada_modelkit import AgentRequest, AgentResponse
# Import a provider
from mada_modelkit.providers.cloud.openai import OpenAIClient
async def main():
    async with OpenAIClient(
        api_key="sk-...",
        model="gpt-4o",
    ) as client:
        response = await client.send_request(
            AgentRequest(prompt="Explain TCP in one sentence.")
        )
        print(response.content)

Adding middleware
Section titled “Adding middleware”

Middleware wraps any client. Stack them in any order:
from mada_modelkit import (
    RetryMiddleware,
    CircuitBreakerMiddleware,
    CachingMiddleware,
    TrackingMiddleware,
)
from mada_modelkit.providers.cloud.openai import OpenAIClient
# Build the stack: tracking → cache → circuit breaker → retry → provider
provider = OpenAIClient(api_key="sk-...", model="gpt-4o")
retry = RetryMiddleware(provider, max_retries=3, backoff_base=1.0)
circuit = CircuitBreakerMiddleware(retry, failure_threshold=5)
cache = CachingMiddleware(circuit, ttl=3600.0)
client = TrackingMiddleware(cache)
# Use it like any other client
response = await client.send_request(AgentRequest(prompt="Hello"))
# Check stats
print(f"Requests: {client.stats.total_requests}")
print(f"Tokens: {client.stats.total_input_tokens + client.stats.total_output_tokens}")

Streaming
Section titled “Streaming”

from mada_modelkit import AgentRequest
async for chunk in client.send_request_stream(AgentRequest(prompt="Tell a story")):
    print(chunk.delta, end="", flush=True)
    if chunk.is_final:
        print()  # newline at end

Fallback across providers
Section titled “Fallback across providers”

from mada_modelkit import FallbackMiddleware
from mada_modelkit.providers.cloud.openai import OpenAIClient
from mada_modelkit.providers.cloud.anthropic import AnthropicClient
primary = OpenAIClient(api_key="sk-...", model="gpt-4o")
fallback = AnthropicClient(api_key="sk-ant-...", model="claude-sonnet-4-6")
client = FallbackMiddleware(primary, [fallback])
# If OpenAI fails, automatically tries Anthropic
response = await client.send_request(AgentRequest(prompt="Hello"))

Local models
Section titled “Local models”

from mada_modelkit.providers.local_server.ollama import OllamaClient
async with OllamaClient(model="llama3.1") as client:
    response = await client.send_request(
        AgentRequest(prompt="What is WACP?")
    )
    print(response.content)

Native (in-process) inference
Section titled “Native (in-process) inference”

from mada_modelkit.providers.native.llamacpp import LlamaCppClient
async with LlamaCppClient(model_path="/path/to/model.gguf") as client:
    response = await client.send_request(
        AgentRequest(prompt="Hello", max_tokens=100)
    )
    print(response.content)