Python SDK Reference
The SDK provides synchronous and async clients for interacting with the SIE server.
Installation
pip install sie-sdk
SIEClient
Synchronous client for the SIE server.
Constructor
from sie_sdk import SIEClient
client = SIEClient(
    base_url: str,                  # Server URL (e.g., "http://localhost:8080")
    timeout_s: float = 30.0,        # Request timeout in seconds
    api_key: str | None = None,     # API key for authentication
    gpu: str | None = None,         # Default GPU type for routing
    options: dict | None = None,    # Default options for all requests
    pool: PoolSpec | None = None,   # Resource pool configuration
)
Methods
encode()
Generate embeddings.
def encode(
    model: str,                                  # Model name
    items: Item | list[Item],                    # Items to encode
    *,
    output_types: list[str] | None = None,       # ['dense', 'sparse', 'multivector']
    instruction: str | None = None,              # Task instruction for instruction-tuned models
    output_dtype: str | None = None,             # 'float32', 'float16', 'int8', 'binary'
    is_query: bool | None = None,                # Query vs document encoding
    options: dict | None = None,                 # Runtime options (e.g., {"profile": "sparse"})
    gpu: str | None = None,                      # GPU routing
    wait_for_capacity: bool = False,             # Wait for scale-up
    provision_timeout_s: float | None = None,    # Max wait time
) -> EncodeResult | list[EncodeResult]

Returns: A single EncodeResult if a single item is passed, otherwise a list.
Example:
# Single item
result = client.encode("BAAI/bge-m3", Item(text="Hello"))
print(result["dense"][:5])
# Batch
results = client.encode("BAAI/bge-m3", [
    Item(text="First"),
    Item(text="Second"),
])
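The output_types and output_dtype parameters control which representations come back and how dense vectors are stored. A sketch based only on the parameters documented above (it assumes the chosen model exposes both dense and sparse outputs, as BAAI/bge-m3 does):

# Request several representations in one call
result = client.encode(
    "BAAI/bge-m3",
    Item(text="Hello"),
    output_types=["dense", "sparse"],
)
print(result["dense"].shape)
print(result["sparse"]["indices"][:5], result["sparse"]["values"][:5])

# Request quantized dense output to reduce storage
compact = client.encode(
    "BAAI/bge-m3",
    Item(text="Hello"),
    output_dtype="int8",
)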
score()
Rerank items against a query using a cross-encoder or late interaction model. Returns items sorted by relevance score (highest first).
def score(
    model: str,                              # Model name (e.g., "BAAI/bge-reranker-v2-m3")
    query: Item,                             # Query item with text or multivector
    items: list[Item],                       # Items to score against the query
    *,
    instruction: str | None = None,          # Optional instruction for instruction-tuned models
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ScoreResult

Example:
result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query=Item(text="What is Python?"),
    items=[Item(text="Python is..."), Item(text="Java is...")],
)
# Scores are sorted by relevance (rank 0 = most relevant)
for entry in result["scores"]:
    print(f"Rank {entry['rank']}: {entry['score']:.3f}")

Note: For ColBERT-style models, you can pass pre-computed multivectors to score client-side without a server round-trip. See the Scoring Utilities section.
extract()
Extract entities or structured data from text. Supports Named Entity Recognition (NER) models like GLiNER.
def extract(
    model: str,                                # Model name (e.g., "urchade/gliner_multi-v2.1")
    items: Item | list[Item],                  # Items to extract from
    *,
    labels: list[str] | None = None,           # Entity types to extract (e.g., ["person", "org"])
    output_schema: dict | None = None,         # JSON schema for structured extraction
    instruction: str | None = None,
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ExtractResult | list[ExtractResult]

Returns: A single ExtractResult if a single item is passed, otherwise a list.
Example:
result = client.extract(
    "urchade/gliner_multi-v2.1",
    Item(text="Tim Cook leads Apple."),
    labels=["person", "organization"],
)
for entity in result["entities"]:
    print(f"{entity['label']}: {entity['text']} (score: {entity['score']:.2f})")

# Output:
# person: Tim Cook (score: 0.95)
# organization: Apple (score: 0.92)
list_models()
Get available models.
def list_models() -> list[ModelInfo]
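For example, iterating the returned entries using only fields documented in the ModelInfo type below:

for m in client.list_models():
    status = "loaded" if m.get("loaded") else "not loaded"
    print(f"{m.get('name')}: {status}, outputs={m.get('outputs', [])}")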
get_capacity()
Get cluster capacity information.
def get_capacity(
    *,
    gpu: str | None = None,
) -> CapacityInfo
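For example (the contents of CapacityInfo depend on your deployment, so this sketch simply inspects the raw result; the GPU name is illustrative):

capacity = client.get_capacity(gpu="l4")
print(capacity)  # CapacityInfo fields vary with cluster configuration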
wait_for_capacity()
Wait for GPU capacity to become available. This is useful for pre-warming the cluster before running benchmarks.
def wait_for_capacity(
    gpu: str,
    *,
    model: str | None = None,          # If provided, sends a warmup encode request
    timeout_s: float | None = None,    # Default: 300 seconds
    poll_interval_s: float = 5.0,
) -> CapacityInfo
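For example, pre-warming a GPU type and loading a model before a benchmark run (the GPU name is illustrative):

# Block until capacity for a100-80gb is available, sending a warmup encode for bge-m3
client.wait_for_capacity(
    "a100-80gb",
    model="BAAI/bge-m3",
    timeout_s=600,
)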
close()
Close the HTTP client.
def close() -> None
Context Manager

with SIEClient("http://localhost:8080") as client:
    result = client.encode("BAAI/bge-m3", Item(text="Hello"))
# Client automatically closed
SIEAsyncClient
Async client with an identical API. All methods are coroutines.
from sie_sdk import SIEAsyncClient
async with SIEAsyncClient("http://localhost:8080") as client:
    result = await client.encode("BAAI/bge-m3", Item(text="Hello"))
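Because all methods are coroutines, independent requests can be issued concurrently. A sketch using asyncio.gather (for large batches you can also pass a list of Items in a single call):

import asyncio

from sie_sdk import SIEAsyncClient
from sie_sdk.types import Item

async def encode_many(texts: list[str]) -> list:
    async with SIEAsyncClient("http://localhost:8080") as client:
        # One encode call per text, awaited concurrently
        return await asyncio.gather(
            *(client.encode("BAAI/bge-m3", Item(text=t)) for t in texts)
        )

results = asyncio.run(encode_many(["First", "Second", "Third"]))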
Item
Input item for encode, score, and extract operations. Most models only use text, but multimodal models can process images.

from sie_sdk.types import Item
class Item(TypedDict, total=False):
    id: str                      # Client-provided ID (echoed in response)
    text: str                    # Text content
    images: list[ImageInput]     # Image data (for multimodal models)
    audio: AudioInput            # Audio data
    video: VideoInput            # Video data
    metadata: dict[str, Any]     # Custom metadata
    multivector: NDArray         # Pre-computed vectors (for client-side MaxSim)

Common patterns:
# Simple text
Item(text="Hello world")
# With ID for tracking
Item(id="doc-1", text="Document text")
# Multimodal (for CLIP, ColPali, etc.)
Item(text="Description", images=["photo.jpg"])
ImageInput

class ImageInput(TypedDict, total=False):
    data: Image.Image | NDArray | bytes | str | Path    # Image in various formats
    format: str                                          # 'jpeg', 'png' - inferred if not provided

The SDK accepts various image formats and converts them to JPEG bytes for transport:
PIL.Image - Converted to JPEG
np.ndarray - Converted via PIL to JPEG
bytes - Passed through as-is
str or Path - Loaded from file path
EncodeResult
class EncodeResult(TypedDict, total=False):
    id: str                             # Echoed item ID
    dense: NDArray[np.float32]          # Dense embedding
    sparse: SparseResult                # Sparse embedding
    multivector: NDArray[np.float32]    # Per-token embeddings
    timing: TimingInfo                  # Timing breakdown
SparseResult

class SparseResult(TypedDict):
    indices: NDArray[np.int32]     # Token IDs
    values: NDArray[np.float32]    # Token weights
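For example, a sparse result can be flattened into a token-id to weight mapping before being written to a sparse or inverted index (a sketch using only the fields above):

result = client.encode("BAAI/bge-m3", Item(text="Hello"), output_types=["sparse"])
sparse = result["sparse"]

# Map token IDs to weights for storage
weights = dict(zip(sparse["indices"].tolist(), sparse["values"].tolist()))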
ScoreResult

class ScoreResult(TypedDict, total=False):
    model: str                  # Model used for scoring
    query_id: str               # Query ID (if provided in request)
    scores: list[ScoreEntry]    # Sorted by score descending
ScoreEntry

class ScoreEntry(TypedDict):
    item_id: str    # ID of the item
    score: float    # Relevance score
    rank: int       # Position (0 = most relevant)
ExtractResult

class ExtractResult(TypedDict, total=False):
    id: str
    entities: list[Entity]
    relations: list[Relation]
    classifications: list[Classification]
    objects: list[DetectedObject]
    data: dict[str, Any]
Entity

class Entity(TypedDict, total=False):
    text: str                 # Extracted span
    label: str                # Entity type
    score: float              # Confidence (0-1)
    start: int | None         # Start character offset
    end: int | None           # End character offset
    bbox: list[int] | None    # Bounding box [x, y, width, height] for vision models
Relation

class Relation(TypedDict):
    head: str        # Source entity
    tail: str        # Target entity
    relation: str    # Relation type
    score: float
Classification

class Classification(TypedDict):
    label: str
    score: float
ModelInfo

class ModelInfo(TypedDict, total=False):
    name: str                   # Model name/identifier
    loaded: bool                # Whether model weights are in memory
    inputs: list[str]           # Input types: ["text"], ["text", "image"], etc.
    outputs: list[str]          # Output types: ["dense"], ["dense", "sparse"], etc.
    dims: ModelDims             # Dimension info for each output type
    max_sequence_length: int    # Maximum input sequence length
TimingInfo

class TimingInfo(TypedDict, total=False):
    total_ms: float           # Total request time
    queue_ms: float           # Time waiting in queue
    tokenization_ms: float    # Tokenization time
    inference_ms: float       # Model inference time
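All timing fields are optional, so reading them defensively is a reasonable pattern (a sketch based only on the fields above):

result = client.encode("BAAI/bge-m3", Item(text="Hello"))
timing = result.get("timing", {})
print(f"total={timing.get('total_ms')} ms, inference={timing.get('inference_ms')} ms")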
Scoring Utilities
Client-side scoring for multi-vector embeddings.
maxsim()
Compute MaxSim scores for ColBERT-style retrieval. MaxSim finds the maximum similarity between each query token and any document token, then sums these maximums.
from sie_sdk.scoring import maxsim
def maxsim(
    query: NDArray[np.float32],                                  # [num_query_tokens, dim]
    documents: list[NDArray[np.float32]] | NDArray[np.float32],  # List of [num_doc_tokens, dim]
) -> list[float]

Example:
from sie_sdk.scoring import maxsim
# Encode query with is_query=True for ColBERT models
query_result = client.encode(
    "jinaai/jina-colbert-v2",
    Item(text="What is ColBERT?"),
    output_types=["multivector"],
    is_query=True,  # Query mode for late interaction models
)
# Encode documents (no is_query needed for documents)
doc_results = client.encode(
    "jinaai/jina-colbert-v2",
    documents,
    output_types=["multivector"],
)
# Compute MaxSim scores client-side
query_mv = query_result["multivector"]
doc_mvs = [r["multivector"] for r in doc_results]
scores = maxsim(query_mv, doc_mvs)
# Rank by score (higher is more relevant)
ranked = sorted(enumerate(scores), key=lambda x: -x[1])
maxsim_batch()
Batch version for multiple queries.
def maxsim_batch(
    queries: list[NDArray[np.float32]],
    documents: list[NDArray[np.float32]],
) -> NDArray[np.float32]  # [num_queries, num_documents]
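For example, ranking every document for every query from the returned score matrix (a sketch reusing query_mv and doc_mvs from the maxsim() example above):

import numpy as np
from sie_sdk.scoring import maxsim_batch

# One row per query, one column per document
score_matrix = maxsim_batch([query_mv], doc_mvs)

# Document indices sorted from most to least relevant, per query
ranked_per_query = np.argsort(-score_matrix, axis=1)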
Errors
Exception hierarchy for SDK errors.
SIEError
Base class for all SDK errors.
class SIEError(Exception):
    pass
SIEConnectionError
Cannot connect to the server.
class SIEConnectionError(SIEError):
    message: str
RequestError
Invalid request (4xx responses).
class RequestError(SIEError):
    code: str
    status_code: int
ServerError
Server error (5xx responses).
class ServerError(SIEError):
    code: str
    status_code: int
ProvisioningError
No capacity available or timeout waiting for scale-up.
class ProvisioningError(SIEError):
    gpu: str
    retry_after: float | None
PoolError
Resource pool operation failed.
class PoolError(SIEError):
    pool_name: str
    state: str
LoraLoadingError
LoRA adapter loading timeout.
class LoraLoadingError(SIEError):
    lora: str
    model: str
Handling Errors

from sie_sdk import SIEClient
from sie_sdk.client.errors import RequestError, ProvisioningError
client = SIEClient("http://localhost:8080")
try:
    result = client.encode("unknown-model", Item(text="test"))
except RequestError as e:
    print(f"Invalid request: {e.code} ({e.status_code})")
except ProvisioningError as e:
    print(f"No capacity for GPU {e.gpu}, retry after {e.retry_after}s")
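If a request fails with ProvisioningError, one option is to retry with the wait_for_capacity and provision_timeout_s parameters documented on encode() so the call blocks while the cluster scales up. A sketch (model and GPU names are illustrative; whether this recovers depends on your cluster):

try:
    result = client.encode("BAAI/bge-m3", Item(text="test"), gpu="a100-80gb")
except ProvisioningError:
    # Retry, this time waiting up to 10 minutes for capacity to be provisioned
    result = client.encode(
        "BAAI/bge-m3",
        Item(text="test"),
        gpu="a100-80gb",
        wait_for_capacity=True,
        provision_timeout_s=600,
    )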
GPU Routing
For cluster deployments with multiple GPU types, specify the target GPU:
# Per-request GPU selection
result = client.encode(
    "BAAI/bge-m3",
    items,
    gpu="a100-80gb",
)
# Default GPU for all requests
client = SIEClient(
    "http://router.example.com",
    gpu="l4",
)

Available GPU types depend on your cluster configuration.
Resource Pools
Create isolated worker sets for testing or tenant isolation:
# Create a pool explicitly
client = SIEClient("http://router.example.com")
client.create_pool("my-test-pool", {"l4": 2, "a100-40gb": 1})
# Route requests to the pool
result = client.encode("BAAI/bge-m3", items, gpu="my-test-pool/l4")
# Check pool status
info = client.get_pool("my-test-pool")
print(f"Pool state: {info['status']['state']}, workers: {len(info['status']['assigned_workers'])}")
# Clean up
client.delete_pool("my-test-pool")
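To make sure pooled workers are released even if a test fails midway, the same calls can be wrapped in try/finally (a sketch using only the methods shown above):

client.create_pool("my-test-pool", {"l4": 2})
try:
    result = client.encode("BAAI/bge-m3", items, gpu="my-test-pool/l4")
finally:
    # Always delete the pool, even on failure
    client.delete_pool("my-test-pool")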
Image Handling
The SDK accepts multiple image input formats and converts them to JPEG bytes for transport:
from PIL import Image
import numpy as np
# From PIL Image (recommended)
pil_image = Image.open("photo.jpg")
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": pil_image}]
))
# From file path (string or Path)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": "photo.jpg"}]
))
# From bytes (passed through as-is)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": image_bytes, "format": "jpeg"}]
))
# From numpy array (H, W, C)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": np.array(pil_image)}]
))
# Direct image input (shorthand without dict wrapper)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[pil_image, "photo2.jpg"]  # Mixed formats supported
))
Complete Example

from sie_sdk import SIEClient
from sie_sdk.types import Item
# Initialize client
client = SIEClient("http://localhost:8080", timeout_s=60.0)
# Dense embeddings
embeddings = client.encode(
    "BAAI/bge-m3",
    [Item(id=f"doc-{i}", text=doc) for i, doc in enumerate(documents)],
)
# Store in vector database
for result in embeddings:
    vector_db.insert(result["id"], result["dense"])
# Query with reranking
query = Item(text="What is machine learning?")
# Stage 1: Vector search
query_emb = client.encode("BAAI/bge-m3", query, is_query=True)
candidates = vector_db.search(query_emb["dense"], top_k=100)
# Stage 2: Rerank
rerank_result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query,
    [Item(id=c.id, text=c.text) for c in candidates],
)
# Top 10 results
top_10 = rerank_result["scores"][:10]
# Clean up
client.close()