
Python SDK Reference

The SDK provides synchronous and async clients for interacting with the SIE server.

pip install sie-sdk

SIEClient is the synchronous client for the SIE server.

from sie_sdk import SIEClient
client = SIEClient(
    base_url: str,                 # Server URL (e.g., "http://localhost:8080")
    timeout_s: float = 30.0,       # Request timeout in seconds
    api_key: str | None = None,    # API key for authentication
    gpu: str | None = None,        # Default GPU type for routing
    options: dict | None = None,   # Default options for all requests
    pool: PoolSpec | None = None,  # Resource pool configuration
)
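
For example, constructing a client with a longer timeout and authentication (the key value here is a placeholder):

from sie_sdk import SIEClient

client = SIEClient(
    "http://localhost:8080",
    timeout_s=60.0,
    api_key="YOUR_API_KEY",  # placeholder; only needed if the server enforces auth
)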

encode() generates embeddings.

def encode(
    model: str,                                 # Model name
    items: Item | list[Item],                   # Items to encode
    *,
    output_types: list[str] | None = None,      # ['dense', 'sparse', 'multivector']
    instruction: str | None = None,             # Task instruction for instruction-tuned models
    output_dtype: str | None = None,            # 'float32', 'float16', 'int8', 'binary'
    is_query: bool | None = None,               # Query vs document encoding
    options: dict | None = None,                # Runtime options (e.g., {"profile": "sparse"})
    gpu: str | None = None,                     # GPU routing
    wait_for_capacity: bool = False,            # Wait for scale-up
    provision_timeout_s: float | None = None,   # Max wait time
) -> EncodeResult | list[EncodeResult]

Returns a single EncodeResult if a single item is passed, otherwise a list.

Example:

from sie_sdk.types import Item

# Single item
result = client.encode("BAAI/bge-m3", Item(text="Hello"))
print(result["dense"][:5])

# Batch
results = client.encode("BAAI/bge-m3", [
    Item(text="First"),
    Item(text="Second"),
])
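
Several output types can also be requested in one call. A minimal sketch, assuming the model advertises both dense and sparse outputs (as reported by the outputs field of ModelInfo):

# Request dense and sparse embeddings together
result = client.encode(
    "BAAI/bge-m3",
    Item(text="Hello"),
    output_types=["dense", "sparse"],
)
print(result["dense"][:5])
print(result["sparse"]["indices"][:5], result["sparse"]["values"][:5])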

score() reranks items against a query using a cross-encoder or late interaction model. It returns items sorted by relevance score (highest first).

def score(
    model: str,                       # Model name (e.g., "BAAI/bge-reranker-v2-m3")
    query: Item,                      # Query item with text or multivector
    items: list[Item],                # Items to score against query
    *,
    instruction: str | None = None,   # Optional instruction for instruction-tuned models
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ScoreResult

Example:

result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query=Item(text="What is Python?"),
    items=[Item(text="Python is..."), Item(text="Java is...")],
)

# Scores are sorted by relevance (rank 0 = most relevant)
for entry in result["scores"]:
    print(f"Rank {entry['rank']}: {entry['score']:.3f}")

Note: For ColBERT-style models, you can pass pre-computed multivectors to score client-side without a server round-trip. See the Scoring Utilities section.

extract() extracts entities or structured data from text. It supports Named Entity Recognition (NER) models such as GLiNER.

def extract(
    model: str,                          # Model name (e.g., "urchade/gliner_multi-v2.1")
    items: Item | list[Item],            # Items to extract from
    *,
    labels: list[str] | None = None,     # Entity types to extract (e.g., ["person", "org"])
    output_schema: dict | None = None,   # JSON schema for structured extraction
    instruction: str | None = None,
    options: dict | None = None,
    gpu: str | None = None,
    wait_for_capacity: bool = False,
    provision_timeout_s: float | None = None,
) -> ExtractResult | list[ExtractResult]

Returns a single ExtractResult if a single item is passed, otherwise a list.

Example:

result = client.extract(
    "urchade/gliner_multi-v2.1",
    Item(text="Tim Cook leads Apple."),
    labels=["person", "organization"],
)
for entity in result["entities"]:
    print(f"{entity['label']}: {entity['text']} (score: {entity['score']:.2f})")
# Output:
# person: Tim Cook (score: 0.95)
# organization: Apple (score: 0.92)
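
Structured extraction via output_schema is sketched below; whether a given model honors a JSON schema, and the exact schema dialect accepted, are assumptions here. The structured result is read from the data field of ExtractResult:

# Hypothetical schema; the accepted dialect is an assumption
result = client.extract(
    "urchade/gliner_multi-v2.1",
    Item(text="Tim Cook leads Apple."),
    output_schema={
        "type": "object",
        "properties": {
            "ceo": {"type": "string"},
            "company": {"type": "string"},
        },
    },
)
print(result["data"])  # e.g., {"ceo": "Tim Cook", "company": "Apple"}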

list_models() returns the available models.

def list_models() -> list[ModelInfo]
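
For example, to see which models are available and whether their weights are resident (fields from ModelInfo below):

for m in client.list_models():
    status = "loaded" if m.get("loaded") else "cold"
    print(f"{m['name']}: {status}, outputs={m.get('outputs')}")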

get_capacity() returns cluster capacity information.

def get_capacity(
    *,
    gpu: str | None = None,
) -> CapacityInfo

wait_for_capacity() blocks until GPU capacity becomes available. This is useful for pre-warming the cluster before running benchmarks.

def wait_for_capacity(
    gpu: str,
    *,
    model: str | None = None,         # If provided, sends a warmup encode request
    timeout_s: float | None = None,   # Max wait; None means the default of 300 seconds
    poll_interval_s: float = 5.0,
) -> CapacityInfo
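
For example, to block until an l4 worker is ready and warm a model before benchmarking:

# Waits up to 10 minutes; also sends a warmup encode for the model
client.wait_for_capacity("l4", model="BAAI/bge-m3", timeout_s=600.0)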

close() closes the underlying HTTP client.

def close() -> None

The client can also be used as a context manager, which closes it automatically:

with SIEClient("http://localhost:8080") as client:
    result = client.encode("BAAI/bge-m3", Item(text="Hello"))
# Client automatically closed

SIEAsyncClient is the async client with an identical API. All methods are coroutines.

from sie_sdk import SIEAsyncClient

async with SIEAsyncClient("http://localhost:8080") as client:
    result = await client.encode("BAAI/bge-m3", Item(text="Hello"))
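
Because the methods are coroutines, independent requests can run concurrently. A minimal sketch using asyncio.gather:

import asyncio

from sie_sdk import SIEAsyncClient
from sie_sdk.types import Item

async def main():
    async with SIEAsyncClient("http://localhost:8080") as client:
        # Both requests are in flight at the same time
        hello, world = await asyncio.gather(
            client.encode("BAAI/bge-m3", Item(text="Hello")),
            client.encode("BAAI/bge-m3", Item(text="World")),
        )

asyncio.run(main())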

Input item for encode, score, and extract operations. Most models only use text, but multimodal models can process images.

from sie_sdk.types import Item

class Item(TypedDict, total=False):
    id: str                    # Client-provided ID (echoed in response)
    text: str                  # Text content
    images: list[ImageInput]   # Image data (for multimodal models)
    audio: AudioInput          # Audio data
    video: VideoInput          # Video data
    metadata: dict[str, Any]   # Custom metadata
    multivector: NDArray       # Pre-computed vectors (for client-side MaxSim)

Common patterns:

# Simple text
Item(text="Hello world")

# With ID for tracking
Item(id="doc-1", text="Document text")

# Multimodal (for CLIP, ColPali, etc.)
Item(text="Description", images=["photo.jpg"])

ImageInput describes an image attached to an Item:

class ImageInput(TypedDict, total=False):
    data: Image.Image | NDArray | bytes | str | Path   # Image in various formats
    format: str                                        # 'jpeg', 'png' - inferred if not provided

The SDK accepts various image formats and converts them to JPEG bytes for transport:

  • PIL.Image - Converted to JPEG
  • np.ndarray - Converted via PIL to JPEG
  • bytes - Passed through as-is
  • str or Path - Loaded from file path

class EncodeResult(TypedDict, total=False):
    id: str                             # Echoed item ID
    dense: NDArray[np.float32]          # Dense embedding
    sparse: SparseResult                # Sparse embedding
    multivector: NDArray[np.float32]    # Per-token embeddings
    timing: TimingInfo                  # Timing breakdown

class SparseResult(TypedDict):
    indices: NDArray[np.int32]    # Token IDs
    values: NDArray[np.float32]   # Token weights

class ScoreResult(TypedDict, total=False):
    model: str                 # Model used for scoring
    query_id: str              # Query ID (if provided in request)
    scores: list[ScoreEntry]   # Sorted by score descending

class ScoreEntry(TypedDict):
    item_id: str   # ID of the item
    score: float   # Relevance score
    rank: int      # Position (0 = most relevant)

class ExtractResult(TypedDict, total=False):
    id: str
    entities: list[Entity]
    relations: list[Relation]
    classifications: list[Classification]
    objects: list[DetectedObject]
    data: dict[str, Any]

class Entity(TypedDict, total=False):
    text: str                # Extracted span
    label: str               # Entity type
    score: float             # Confidence (0-1)
    start: int | None        # Start character offset
    end: int | None          # End character offset
    bbox: list[int] | None   # Bounding box [x, y, width, height] for vision models

class Relation(TypedDict):
    head: str       # Source entity
    tail: str       # Target entity
    relation: str   # Relation type
    score: float

class Classification(TypedDict):
    label: str
    score: float

class ModelInfo(TypedDict, total=False):
    name: str                  # Model name/identifier
    loaded: bool               # Whether model weights are in memory
    inputs: list[str]          # Input types: ["text"], ["text", "image"], etc.
    outputs: list[str]         # Output types: ["dense"], ["dense", "sparse"], etc.
    dims: ModelDims            # Dimension info for each output type
    max_sequence_length: int   # Maximum input sequence length

class TimingInfo(TypedDict, total=False):
    total_ms: float          # Total request time
    queue_ms: float          # Time waiting in queue
    tokenization_ms: float   # Tokenization time
    inference_ms: float      # Model inference time
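
For example, to inspect where an encode request spent its time (timing fields may be absent depending on server configuration, hence the guards):

result = client.encode("BAAI/bge-m3", Item(text="Hello"))
timing = result.get("timing", {})
print(f"total: {timing.get('total_ms')} ms, inference: {timing.get('inference_ms')} ms")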

Client-side scoring for multi-vector embeddings.

maxsim() computes MaxSim scores for ColBERT-style retrieval: it finds the maximum similarity between each query token and any document token, then sums these maxima.

from sie_sdk.scoring import maxsim

def maxsim(
    query: NDArray[np.float32],                                   # [num_query_tokens, dim]
    documents: list[NDArray[np.float32]] | NDArray[np.float32],   # List of [num_doc_tokens, dim]
) -> list[float]

Example:

from sie_sdk.scoring import maxsim

# Encode query with is_query=True for ColBERT models
query_result = client.encode(
    "jinaai/jina-colbert-v2",
    Item(text="What is ColBERT?"),
    output_types=["multivector"],
    is_query=True,  # Query mode for late interaction models
)

# Encode documents (no is_query needed for documents)
doc_results = client.encode(
    "jinaai/jina-colbert-v2",
    documents,
    output_types=["multivector"],
)

# Compute MaxSim scores client-side
query_mv = query_result["multivector"]
doc_mvs = [r["multivector"] for r in doc_results]
scores = maxsim(query_mv, doc_mvs)

# Rank by score (higher is more relevant)
ranked = sorted(enumerate(scores), key=lambda x: -x[1])

maxsim_batch() is the batch version for multiple queries.

def maxsim_batch(
    queries: list[NDArray[np.float32]],
    documents: list[NDArray[np.float32]],
) -> NDArray[np.float32]   # [num_queries, num_documents]
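
Continuing the maxsim example above, a sketch of scoring several queries at once; row i of the matrix holds query i's scores against every document:

import numpy as np

from sie_sdk.scoring import maxsim_batch

# query_mvs and doc_mvs are lists of [num_tokens, dim] arrays,
# as produced by encode(..., output_types=["multivector"])
score_matrix = maxsim_batch(query_mvs, doc_mvs)
best_doc_per_query = np.argmax(score_matrix, axis=1)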

Exception hierarchy for SDK errors.

Base class for all SDK errors.

class SIEError(Exception):
    pass

Cannot connect to server.

class SIEConnectionError(SIEError):
    message: str

Invalid request (4xx responses).

class RequestError(SIEError):
    code: str
    status_code: int

Server error (5xx responses).

class ServerError(SIEError):
    code: str
    status_code: int

No capacity available or timeout waiting for scale-up.

class ProvisioningError(SIEError):
    gpu: str
    retry_after: float | None

Resource pool operation failed.

class PoolError(SIEError):
    pool_name: str
    state: str

LoRA adapter loading timeout.

class LoraLoadingError(SIEError):
    lora: str
    model: str

Example:

from sie_sdk import SIEClient
from sie_sdk.client.errors import RequestError, ProvisioningError
from sie_sdk.types import Item

client = SIEClient("http://localhost:8080")
try:
    result = client.encode("unknown-model", Item(text="test"))
except RequestError as e:
    print(f"Invalid request: {e.code} ({e.status_code})")
except ProvisioningError as e:
    print(f"No capacity for GPU {e.gpu}, retry after {e.retry_after}s")

For cluster deployments with multiple GPU types, specify the target GPU:

# Per-request GPU selection
result = client.encode(
    "BAAI/bge-m3",
    items,
    gpu="a100-80gb",
)

# Default GPU for all requests
client = SIEClient(
    "http://router.example.com",
    gpu="l4",
)

Available GPU types depend on your cluster configuration.


Create isolated worker sets for testing or tenant isolation:

# Create a pool explicitly
client = SIEClient("http://router.example.com")
client.create_pool("my-test-pool", {"l4": 2, "a100-40gb": 1})
# Route requests to the pool
result = client.encode("BAAI/bge-m3", items, gpu="my-test-pool/l4")
# Check pool status
info = client.get_pool("my-test-pool")
print(f"Pool state: {info['status']['state']}, workers: {len(info['status']['assigned_workers'])}")
# Clean up
client.delete_pool("my-test-pool")

The SDK accepts multiple image input formats and converts them to JPEG bytes for transport:

from PIL import Image
import numpy as np

# From PIL Image (recommended)
pil_image = Image.open("photo.jpg")
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": pil_image}],
))

# From file path (string or Path)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": "photo.jpg"}],
))

# From bytes (passed through as-is)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": image_bytes, "format": "jpeg"}],
))

# From numpy array (H, W, C)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[{"data": np.array(pil_image)}],
))

# Direct image input (shorthand without dict wrapper)
result = client.encode("openai/clip-vit-base-patch32", Item(
    images=[pil_image, "photo2.jpg"],  # Mixed formats supported
))

A complete retrieve-and-rerank pipeline, putting the pieces together (documents and vector_db stand in for your corpus and vector store):

from sie_sdk import SIEClient
from sie_sdk.types import Item

# Initialize client
client = SIEClient("http://localhost:8080", timeout_s=60.0)

# Dense embeddings
embeddings = client.encode(
    "BAAI/bge-m3",
    [Item(id=f"doc-{i}", text=doc) for i, doc in enumerate(documents)],
)

# Store in vector database
for result in embeddings:
    vector_db.insert(result["id"], result["dense"])

# Query with reranking
query = Item(text="What is machine learning?")

# Stage 1: Vector search
query_emb = client.encode("BAAI/bge-m3", query, is_query=True)
candidates = vector_db.search(query_emb["dense"], top_k=100)

# Stage 2: Rerank
rerank_result = client.score(
    "BAAI/bge-reranker-v2-m3",
    query,
    [Item(id=c.id, text=c.text) for c in candidates],
)

# Top 10 results
top_10 = rerank_result["scores"][:10]

# Clean up
client.close()