
Python SDK Reference

The SDK provides synchronous and async clients for interacting with the SIE server.

pip install sie-sdk

SIEClient is the synchronous client for the SIE server.

from sie_sdk import SIEClient
client = SIEClient(
    base_url: str,                 # Server URL (e.g., "http://localhost:8080")
    timeout_s: float = 30.0,       # Request timeout in seconds
    api_key: str | None = None,    # API key for authentication
    gpu: str | None = None,        # Default GPU type for routing
    options: dict | None = None,   # Default options for all requests
    pool: PoolSpec | None = None,  # Resource pool configuration
)
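
For example, constructing a client with a longer timeout and authentication (the key value here is a placeholder):

from sie_sdk import SIEClient

client = SIEClient(
    "http://localhost:8080",
    timeout_s=60.0,
    api_key="YOUR_API_KEY",  # placeholder; only needed if the server enforces auth
)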

encode() generates embeddings.

def encode(
    model: str,                                 # Model name
    items: Item | list[Item],                   # Items to encode
    *,
    output_types: list[str] | None = None,      # ['dense', 'sparse', 'multivector']
    instruction: str | None = None,             # Task instruction for instruction-tuned models
    output_dtype: str | None = None,            # 'float32', 'float16', 'int8', 'binary'
    is_query: bool | None = None,               # Query vs document encoding
    options: dict | None = None,                # Runtime options (e.g., {"profile": "sparse"})
    gpu: str | None = None,                     # GPU routing
    wait_for_capacity: bool = False,            # Wait for scale-up
    provision_timeout_s: float | None = None,   # Max wait time
) -> EncodeResult | list[EncodeResult]

Returns a single EncodeResult if a single item is passed, otherwise a list.

Example:

from sie_sdk.types import Item

# Single item
result = client.encode("BAAI/bge-m3", Item(text="Hello"))
print(result["dense"][:5])

# Batch
results = client.encode("BAAI/bge-m3", [
    Item(text="First"),
    Item(text="Second"),
])
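
Several output types can also be requested in one call. A minimal sketch, assuming the model advertises both dense and sparse outputs (as reported by the outputs field of ModelInfo):

# Request dense and sparse embeddings together
result = client.encode(
    "BAAI/bge-m3",
    Item(text="Hello"),
    output_types=["dense", "sparse"],
)
print(result["dense"][:5])
print(result["sparse"]["indices"][:5], result["sparse"]["values"][:5])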

score() reranks items against a query using a cross-encoder or late interaction model. It returns items sorted by relevance score (highest first).

def score(
    model: str,                       # Model name (e.g., "BAAI/bge-reranker-v2-m3")
    query: Item,                      # Query item with text or multivector
    items: list[Item],                # Items to score against query
    *,
    instruction: str | None = None,   # Optional instruction for instruction-tuned models
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ScoreResult

Example:

result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query=Item(text="What is Python?"),
    items=[Item(text="Python is..."), Item(text="Java is...")],
)

# Scores are sorted by relevance (rank 0 = most relevant)
for entry in result["scores"]:
    print(f"Rank {entry['rank']}: {entry['score']:.3f}")

Note: For ColBERT-style models, you can pass pre-computed multivectors to score client-side without a server round-trip. See the Scoring Utilities section.

extract() extracts entities or structured data from text. It supports Named Entity Recognition (NER) models such as GLiNER.

def extract(
    model: str,                          # Model name (e.g., "urchade/gliner_multi-v2.1")
    items: Item | list[Item],            # Items to extract from
    *,
    labels: list[str] | None = None,     # Entity types to extract (e.g., ["person", "org"])
    output_schema: dict | None = None,   # JSON schema for structured extraction
    instruction: str | None = None,
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ExtractResult | list[ExtractResult]

Returns a single ExtractResult if a single item is passed, otherwise a list.

Example:

result = client.extract(
    "urchade/gliner_multi-v2.1",
    Item(text="Tim Cook leads Apple."),
    labels=["person", "organization"],
)
for entity in result["entities"]:
    print(f"{entity['label']}: {entity['text']} (score: {entity['score']:.2f})")
# Output:
# person: Tim Cook (score: 0.95)
# organization: Apple (score: 0.92)
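
Structured extraction via output_schema is sketched below; whether a given model honors a JSON schema, and the exact schema dialect accepted, are assumptions here. The structured result is read from the data field of ExtractResult:

# Hypothetical schema; the accepted dialect is an assumption
result = client.extract(
    "urchade/gliner_multi-v2.1",
    Item(text="Tim Cook leads Apple."),
    output_schema={
        "type": "object",
        "properties": {
            "ceo": {"type": "string"},
            "company": {"type": "string"},
        },
    },
)
print(result["data"])  # e.g., {"ceo": "Tim Cook", "company": "Apple"}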

list_models() returns the available models.

def list_models() -> list[ModelInfo]
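
For example, to see which models are available and whether their weights are resident (fields from ModelInfo below):

for m in client.list_models():
    status = "loaded" if m.get("loaded") else "cold"
    print(f"{m['name']}: {status}, outputs={m.get('outputs')}")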

get_capacity() returns cluster capacity information.

def get_capacity(
    *,
    gpu: str | None = None,
) -> CapacityInfo

wait_for_capacity() blocks until GPU capacity becomes available. This is useful for pre-warming the cluster before running benchmarks.

def wait_for_capacity(
    gpu: str,
    *,
    model: str | None = None,         # If provided, sends a warmup encode request
    timeout_s: float | None = None,   # Max wait; None means the default of 300 seconds
    poll_interval_s: float = 5.0,
) -> CapacityInfo
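
For example, to block until an l4 worker is ready and warm a model before benchmarking:

# Waits up to 10 minutes; also sends a warmup encode for the model
client.wait_for_capacity("l4", model="BAAI/bge-m3", timeout_s=600.0)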

close() closes the underlying HTTP client.

def close() -> None

The client can also be used as a context manager, which closes it automatically:

with SIEClient("http://localhost:8080") as client:
    result = client.encode("BAAI/bge-m3", Item(text="Hello"))
# Client automatically closed

SIEAsyncClient is the async client with an identical API. All methods are coroutines.

from sie_sdk import SIEAsyncClient

async with SIEAsyncClient("http://localhost:8080") as client:
    result = await client.encode("BAAI/bge-m3", Item(text="Hello"))
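
Because the methods are coroutines, independent requests can run concurrently. A minimal sketch using asyncio.gather:

import asyncio

from sie_sdk import SIEAsyncClient
from sie_sdk.types import Item

async def main():
    async with SIEAsyncClient("http://localhost:8080") as client:
        # Both requests are in flight at the same time
        hello, world = await asyncio.gather(
            client.encode("BAAI/bge-m3", Item(text="Hello")),
            client.encode("BAAI/bge-m3", Item(text="World")),
        )

asyncio.run(main())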

Input item for encode, score, and extract operations. Most models only use text, but multimodal models can process images.

from sie_sdk.types import Item

class Item(TypedDict, total=False):
    id: str                    # Client-provided ID (echoed in response)
    text: str                  # Text content
    images: list[ImageInput]   # Image data (for multimodal models)
    audio: AudioInput          # Audio data
    video: VideoInput          # Video data
    metadata: dict[str, Any]   # Custom metadata
    multivector: NDArray       # Pre-computed vectors (for client-side MaxSim)

Common patterns:

# Simple text
Item(text="Hello world")

# With ID for tracking
Item(id="doc-1", text="Document text")

# Multimodal (for CLIP, ColPali, etc.)
Item(text="Description", images=["photo.jpg"])

ImageInput describes an image attached to an Item:

class ImageInput(TypedDict, total=False):
    data: Image.Image | NDArray | bytes | str | Path   # Image in various formats
    format: str                                        # 'jpeg', 'png' - inferred if not provided

The SDK accepts various image formats and converts them to JPEG bytes for transport:

  • PIL.Image - Converted to JPEG
  • np.ndarray - Converted via PIL to JPEG
  • bytes - Passed through as-is
  • str or Path - Loaded from file path

class EncodeResult(TypedDict, total=False):
    id: str                             # Echoed item ID
    dense: NDArray[np.float32]          # Dense embedding
    sparse: SparseResult                # Sparse embedding
    multivector: NDArray[np.float32]    # Per-token embeddings
    timing: TimingInfo                  # Timing breakdown

class SparseResult(TypedDict):
    indices: NDArray[np.int32]    # Token IDs
    values: NDArray[np.float32]   # Token weights

class ScoreResult(TypedDict, total=False):
    model: str                 # Model used for scoring
    query_id: str              # Query ID (if provided in request)
    scores: list[ScoreEntry]   # Sorted by score descending

class ScoreEntry(TypedDict):
    item_id: str   # ID of the item
    score: float   # Relevance score
    rank: int      # Position (0 = most relevant)

class ExtractResult(TypedDict, total=False):
    id: str
    entities: list[Entity]
    relations: list[Relation]
    classifications: list[Classification]
    objects: list[DetectedObject]
    data: dict[str, Any]

class Entity(TypedDict, total=False):
    text: str                # Extracted span
    label: str               # Entity type
    score: float             # Confidence (0-1)
    start: int | None        # Start character offset
    end: int | None          # End character offset
    bbox: list[int] | None   # Bounding box [x, y, width, height] for vision models

class Relation(TypedDict):
    head: str       # Source entity
    tail: str       # Target entity
    relation: str   # Relation type
    score: float

class Classification(TypedDict):
    label: str
    score: float

class ModelInfo(TypedDict, total=False):
    name: str                  # Model name/identifier
    loaded: bool               # Whether model weights are in memory
    inputs: list[str]          # Input types: ["text"], ["text", "image"], etc.
    outputs: list[str]         # Output types: ["dense"], ["dense", "sparse"], etc.
    dims: ModelDims            # Dimension info for each output type
    max_sequence_length: int   # Maximum input sequence length

class TimingInfo(TypedDict, total=False):
    total_ms: float          # Total request time
    queue_ms: float          # Time waiting in queue
    tokenization_ms: float   # Tokenization time
    inference_ms: float      # Model inference time
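
For example, to inspect where an encode request spent its time (timing fields may be absent depending on server configuration, hence the guards):

result = client.encode("BAAI/bge-m3", Item(text="Hello"))
timing = result.get("timing", {})
print(f"total: {timing.get('total_ms')} ms, inference: {timing.get('inference_ms')} ms")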

Client-side scoring for multi-vector embeddings.

maxsim() computes MaxSim scores for ColBERT-style retrieval: it finds the maximum similarity between each query token and any document token, then sums these maxima.

from sie_sdk.scoring import maxsim

def maxsim(
    query: NDArray[np.float32],                                   # [num_query_tokens, dim]
    documents: list[NDArray[np.float32]] | NDArray[np.float32],   # List of [num_doc_tokens, dim]
) -> list[float]

Example:

from sie_sdk.scoring import maxsim

# Encode query with is_query=True for ColBERT models
query_result = client.encode(
    "jinaai/jina-colbert-v2",
    Item(text="What is ColBERT?"),
    output_types=["multivector"],
    is_query=True,  # Query mode for late interaction models
)

# Encode documents (no is_query needed for documents)
doc_results = client.encode(
    "jinaai/jina-colbert-v2",
    documents,
    output_types=["multivector"],
)

# Compute MaxSim scores client-side
query_mv = query_result["multivector"]
doc_mvs = [r["multivector"] for r in doc_results]
scores = maxsim(query_mv, doc_mvs)

# Rank by score (higher is more relevant)
ranked = sorted(enumerate(scores), key=lambda x: -x[1])

maxsim_batch() is the batch version for multiple queries.

def maxsim_batch(
    queries: list[NDArray[np.float32]],
    documents: list[NDArray[np.float32]],
) -> NDArray[np.float32]   # [num_queries, num_documents]
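
Continuing the maxsim example above, a sketch of scoring several queries at once; row i of the matrix holds query i's scores against every document:

import numpy as np

from sie_sdk.scoring import maxsim_batch

# query_mvs and doc_mvs are lists of [num_tokens, dim] arrays,
# as produced by encode(..., output_types=["multivector"])
score_matrix = maxsim_batch(query_mvs, doc_mvs)
best_doc_per_query = np.argmax(score_matrix, axis=1)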

Exception hierarchy for SDK errors.

Base class for all SDK errors.

class SIEError(Exception):
    pass

Cannot connect to server.

class SIEConnectionError(SIEError):
    message: str

Invalid request (4xx responses).

class RequestError(SIEError):
    code: str
    status_code: int

Server error (5xx responses).

class ServerError(SIEError):
    code: str
    status_code: int

No capacity available or timeout waiting for scale-up.

class ProvisioningError(SIEError):
    gpu: str
    retry_after: float | None

Resource pool operation failed.

class PoolError(SIEError):
    pool_name: str
    state: str

LoRA adapter loading timeout.

class LoraLoadingError(SIEError):
    lora: str
    model: str

Example:

from sie_sdk import SIEClient
from sie_sdk.client.errors import RequestError, ProvisioningError
from sie_sdk.types import Item

client = SIEClient("http://localhost:8080")
try:
    result = client.encode("unknown-model", Item(text="test"))
except RequestError as e:
    print(f"Invalid request: {e.code} ({e.status_code})")
except ProvisioningError as e:
    print(f"No capacity for GPU {e.gpu}, retry after {e.retry_after}s")

For cluster deployments with multiple GPU types, specify the target GPU:

# Per-request GPU selection
result = client.encode(
    "BAAI/bge-m3",
    items,
    gpu="a100-80gb",
)

# Default GPU for all requests
client = SIEClient(
    "http://router.example.com",
    gpu="l4",
)

Available GPU types depend on your cluster configuration.


Create isolated worker sets for testing or tenant isolation:

# Create a pool explicitly
client = SIEClient("http://router.example.com")
client.create_pool("my-test-pool", {"l4": 2, "a100-40gb": 1})
# Route requests to the pool
result = client.encode("BAAI/bge-m3", items, gpu="my-test-pool/l4")
# Check pool status
info = client.get_pool("my-test-pool")
print(f"Pool state: {info['status']['state']}, workers: {len(info['status']['assigned_workers'])}")
# Clean up
client.delete_pool("my-test-pool")

The SDK accepts multiple image input formats and converts them to JPEG bytes for transport:

from PIL import Image
import numpy as np

# From PIL Image (recommended)
pil_image = Image.open("photo.jpg")
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": pil_image}],
))

# From file path (string or Path)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": "photo.jpg"}],
))

# From bytes (passed through as-is)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": image_bytes, "format": "jpeg"}],
))

# From numpy array (H, W, C)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": np.array(pil_image)}],
))

# Direct image input (shorthand without dict wrapper)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[pil_image, "photo2.jpg"],  # Mixed formats supported
))

A complete retrieve-and-rerank pipeline, putting the pieces together (documents and vector_db stand in for your corpus and vector store):

from sie_sdk import SIEClient
from sie_sdk.types import Item

# Initialize client
client = SIEClient("http://localhost:8080", timeout_s=60.0)

# Dense embeddings
embeddings = client.encode(
    "BAAI/bge-m3",
    [Item(id=f"doc-{i}", text=doc) for i, doc in enumerate(documents)],
)

# Store in vector database
for result in embeddings:
    vector_db.insert(result["id"], result["dense"])

# Query with reranking
query = Item(text="What is machine learning?")

# Stage 1: Vector search
query_emb = client.encode("BAAI/bge-m3", query, is_query=True)
candidates = vector_db.search(query_emb["dense"], top_k=100)

# Stage 2: Rerank
rerank_result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query,
    [Item(id=c.id, text=c.text) for c in candidates],
)

# Top 10 results
top_10 = rerank_result["scores"][:10]

# Clean up
client.close()