Learn how to use the AI Data SDK for your AI applications
AI Data SDK is a comprehensive toolkit for standardizing, processing, embedding, and retrieving data for AI applications. This guide will help you get started with the SDK.
You can install AI Data SDK using pip:
pip install ai-data-sdk-zeebee==0.1.3
Here's a simple example to get you started:
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Texts to embed
texts = [
    "AI Data SDK helps standardize data for AI applications.",
    "The embedding module converts text into vector representations.",
]

# Optional metadata accompanying each text
metadata = [
    {"source": "documentation", "category": "overview"},
    {"source": "documentation", "category": "technical"},
]

# Request embeddings for the texts
result = client.create_embeddings(
    texts=texts,
    model="text-embedding-3-small",
    normalize=True,
    metadata=metadata,
)

# Search using the first generated embedding
search_result = client.search(embedding=result['embeddings'][0], top_k=5)

# Alternatively, search directly with a text query
query_result = client.search(query="How do machines learn from data?")

# Advanced search: pass metadata filters along with the query
filters = {
    "category": "technology",
    "rating": {"$gt": 4.5},
}
search_result = client.search(query="neural networks", filters=filters, top_k=10)

# PII detection example
text = "My email is john.doe@example.com and my phone is 555-123-4567."
pii_result = client.detect_pii(text=text, pii_types=["email", "phone"], mask_pii=True)

# Advanced PII anonymization
anonymized_result = client.anonymize_pii(
    text=text,
    pii_types=["email", "phone", "name"],
    consistent_replacements=True,
    resolve_overlaps=True,
)
The SDK consists of the following core modules:
The embedding module provides utilities for generating text embeddings using various language models.
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Embed a small batch of texts, passing no optional arguments
texts = [
    "This is a sample text",
    "Another sample text",
]
result = client.create_embeddings(texts=texts)

# Report how many embeddings came back and the length of the first one
print(f"Generated {len(result['embeddings'])} embeddings with dimension {len(result['embeddings'][0])}")
from ai_data_sdk import AIDataClient
# Set up the client
client = AIDataClient(api_key="your_api_key_here")

# Inputs: three texts and one metadata record for each of them
texts = [
    "Example text 1",
    "Example text 2",
    "Example text 3",
]
metadata = [
    {"source": "document1", "category": "news"},
    {"source": "document2", "category": "blog"},
    {"source": "document3", "category": "article"},
]

# Embed with an explicitly chosen model, normalize=True, and the metadata attached
result = client.create_embeddings(
    texts=texts,
    model="text-embedding-3-large",
    normalize=True,
    metadata=metadata,
)

# Pull the embeddings out of the response
embeddings = result['embeddings']
The SDK currently supports the following embedding models:
- text-embedding-3-small (default): OpenAI's newer small embedding model
- text-embedding-3-large: OpenAI's newer large embedding model
- text-embedding-ada-002: OpenAI's older general-purpose embedding model
The vector database module provides tools for storing and searching vector embeddings for semantic similarity operations.
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Simplest form: search by query text only
search_result = client.search(query="How do machines learn from data?")

# The same call with an additional parameter (top_k)
search_result = client.search(query="neural networks", top_k=10)
# First embed a text to obtain a vector
texts = ["AI Data SDK helps standardize data for AI applications."]
result = client.create_embeddings(texts=texts)
embedding = result['embeddings'][0]

# Then search with that vector instead of a text query
search_result = client.search(embedding=embedding, top_k=5)
# Simple metadata filters: an exact match plus a "$gt" comparison
filters = {
    "category": "technology",
    "rating": {"$gt": 4.5},
}
search_result = client.search(query="neural networks", filters=filters, top_k=10)

# Nested filters combining the "$and" and "$or" operators
filters = {
    "$and": [
        {"document_type": "article"},
        {
            "$or": [
                {"category": "AI"},
                {"category": "machine learning"},
            ]
        },
        {"created_at": {"$gt": "2024-01-01"}},
    ]
}
search_result = client.search(query="transformer models", filters=filters)
from ai_data_sdk import APIError, AuthenticationError, InvalidRequestError, RateLimitError
# Handle the SDK's error types around a search call.
# Each handler body must be indented under its clause to be valid Python.
try:
    search_result = client.search(query="example query")
except AuthenticationError:
    print("Authentication failed. Check your API key.")
except InvalidRequestError as e:
    print(f"Invalid request: {e}")
except RateLimitError:
    print("Rate limit exceeded. Please try again later.")
except APIError as e:
    # NOTE(review): listed last, presumably as the most general SDK error — confirm
    print(f"API error: {e}")
# Send feedback on a search result to help improve future search results
feedback = client.submit_feedback(
    query_id="q_12345",
    result_id="doc_1",
    rating=4,  # on a 1-5 scale
    comments="Very relevant result, but missing some details.",
)
The PII detection module provides tools for detecting and masking personally identifiable information (PII) in text.
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Sample input containing an email address and a phone number
text = "My email is john.doe@example.com and my phone is 555-123-4567"

# Detection only: count the PII instances reported
result = client.detect_pii(text=text)
print(f"Detected {len(result['pii_found'])} PII instances")

# Detection with masking: print the masked text
masked_result = client.detect_pii(text=text, mask_pii=True)
print(f"Masked text: {masked_result['masked_text']}")
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Sample input containing an email address and a phone number
text = "My email is john.doe@example.com and my phone is 555-123-4567"

# Limit detection to email addresses
email_result = client.detect_pii(text=text, pii_types=["email"])

# Limit masking to phone numbers
phone_masked_result = client.detect_pii(text=text, pii_types=["phone"], mask_pii=True)
The PII detector supports automatic resolution of overlapping entities, ensuring that PII is properly identified without duplication or fragmentation.
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Sample input where detected PII entities may overlap
text = "John Smith's email is john.smith@example.com"

# Detect PII with overlap resolution turned on
pii_result = client.detect_pii(text=text, resolve_overlaps=True)

# Mask PII with overlap resolution turned on
masked_result = client.detect_pii(text=text, mask_pii=True, resolve_overlaps=True)
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Sample input containing a name, an email address and a phone number
text = "Please contact John Doe at john.doe@example.com or 555-123-4567."

# Anonymize the listed PII types, requesting consistent replacements
anonymized_result = client.anonymize_pii(
    text=text,
    pii_types=["name", "email", "phone"],
    consistent_replacements=True,
    resolve_overlaps=True,
)

# Show the rewritten text and the replacement map from the response
print(f"Anonymized text: {anonymized_result['anonymized_text']}")
print(f"Replacement map: {anonymized_result['replacement_map']}")
The SDK detects the following types of PII by default:
The feedback module provides tools for collecting user feedback on search results and improving search quality over time.
from ai_data_sdk import AIDataClient
# Create a client authenticated with your API key
client = AIDataClient(api_key="your_api_key_here")

# Rate a single search result for a given query
feedback_response = client.submit_feedback(
    query_id="q_123",
    result_id="doc_456",
    rating=4,  # on a 1-5 scale
    comments="This result was helpful and relevant to my query",
)

# Print the feedback ID from the response
print(f"Feedback submitted with ID: {feedback_response['feedback_id']}")
The feedback collected through the API is automatically processed to improve search quality. The system uses this feedback to:
The AI Data SDK can be used as a library in your Python application or as a REST API service.
The SDK provides a ready-to-use REST API that you can integrate with any application. See the API Documentation for details on available endpoints and request formats.
# Generate a JWT token
POST /api/v1/token
{
"user_id": "your_user_id"
}
# Response
{
"token": "eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...",
"expires_at": "2025-04-25T04:18:53Z"
}
// JavaScript example

/**
 * Request embeddings for a list of texts from the REST API.
 *
 * @param {string[]} texts - Texts to embed.
 * @returns {Promise<Object>} Parsed JSON body of the response.
 * @throws {Error} If the server answers with a non-2xx status.
 */
const fetchEmbeddings = async (texts) => {
  const response = await fetch('/api/v1/embeddings', {
    method: 'POST',
    headers: {
      'Content-Type': 'application/json',
      // Placeholder token: substitute the value returned by POST /api/v1/token
      'Authorization': 'Bearer eyJhbGciOiJIUzI1NiIsInR5cCI6IkpXVCJ9...'
    },
    body: JSON.stringify({
      texts: texts
    })
  });

  // fetch() only rejects on network failure, so HTTP error statuses must be
  // checked explicitly; otherwise an error body would be returned as data.
  if (!response.ok) {
    throw new Error(`Embeddings request failed: ${response.status} ${response.statusText}`);
  }

  return await response.json();
};
import requests
class AiDataSdkClient:
    """Minimal REST client for the AI Data SDK HTTP API.

    Wraps the token, embeddings and search endpoints under ``/api/v1``.
    Call :meth:`authenticate` first; the token it obtains is then sent as a
    Bearer credential by :meth:`create_embeddings` and :meth:`search`.
    """

    def __init__(self, base_url, api_key=None, timeout=30):
        """Store the connection settings.

        Args:
            base_url: Root URL of the service; a trailing slash is tolerated.
            api_key: Optional API key. It is only stored on the instance;
                requests are authorized with the token from
                :meth:`authenticate`.
            timeout: Seconds to wait on each HTTP request before
                ``requests`` raises a timeout error. ``None`` waits forever.
        """
        self.base_url = base_url
        self.api_key = api_key
        self.timeout = timeout
        self.token = None

    def _url(self, path):
        """Join *path* onto the base URL without producing a double slash."""
        return f"{self.base_url.rstrip('/')}{path}"

    def _auth_headers(self):
        """Return the Bearer header, or no header before authentication."""
        if self.token is None:
            # Avoid sending the literal string "Bearer None".
            return {}
        return {"Authorization": f"Bearer {self.token}"}

    def _post(self, path, payload, headers=None):
        """POST *payload* as JSON to *path* and return the decoded JSON body.

        Raises:
            requests.HTTPError: If the response status is 4xx or 5xx.
        """
        response = requests.post(
            self._url(path),
            headers=headers,
            json=payload,
            timeout=self.timeout,  # without a timeout a stalled server hangs the caller
        )
        response.raise_for_status()
        return response.json()

    def authenticate(self, user_id):
        """Request a token for *user_id* and remember it on the client.

        Returns:
            The decoded JSON response; its ``"token"`` entry is also stored
            in ``self.token``.
        """
        data = self._post("/api/v1/token", {"user_id": user_id})
        self.token = data["token"]
        return data

    def create_embeddings(self, texts, model=None):
        """Request embeddings for *texts*.

        Args:
            texts: Texts to embed.
            model: Optional model name; omitted from the request when falsy.

        Returns:
            The decoded JSON response.
        """
        payload = {"texts": texts}
        if model:
            payload["model"] = model
        return self._post("/api/v1/embeddings", payload, headers=self._auth_headers())

    def search(self, query=None, embedding=None, top_k=10, filters=None):
        """Run a search by query text or by embedding vector.

        Args:
            query: Query text. Takes precedence when both are given.
            embedding: Embedding to search with when no query is given.
            top_k: Value sent as ``top_k`` in the request.
            filters: Optional filters; omitted from the request when falsy.

        Returns:
            The decoded JSON response.

        Raises:
            ValueError: If neither *query* nor *embedding* is provided.
        """
        payload = {"top_k": top_k}
        if query:
            payload["query"] = query
        elif embedding:
            payload["embedding"] = embedding
        else:
            raise ValueError("Either query or embedding must be provided")
        if filters:
            payload["filters"] = filters
        return self._post("/api/v1/search", payload, headers=self._auth_headers())
The AI Data SDK is built with a modular architecture that allows for flexibility and extensibility.
Embedding module: Handles text embedding generation using language models like OpenAI's embedding models. Manages batch processing and error handling.
Vector database module: Provides a unified interface for different vector databases (in-memory, FAISS). Handles vector storage, retrieval, and similarity search.
PII detection module: Identifies and masks personally identifiable information in text data. Features intelligent non-overlapping entity resolution, custom PII patterns, and selective masking for enhanced privacy protection.
Feedback module: Collects user feedback on search results and detects embedding drift over time. Provides analytics on collected feedback.
Validation utilities: Standardize input validation and error handling across all modules. Provide utilities for JSON schema validation and error reporting.
The SDK is designed to be extensible, allowing you to: