Skip to main content

Open in Google Colab

Run this exact tutorial interactively in Google Colab

Dataframer SDK Demo

In this notebook we will demonstrate how the Dataframer Python SDK (PIP package: pydataframer) can be used to generate large volumes of high-quality synthetic data in which each sample can be arbitrarily large. We will specifically demonstrate how to generate Electronic Health Records (EHR), where each sample represents a patient health record. These can include results of lab tests, diagnoses, medical opinions, etc.

Step 1: Install and Setup SDK

Install the Dataframer SDK and a few other useful utilities.
%%capture
%pip install --upgrade pydataframer dotenv pyyaml tenacity
Initialize the Client. A Dataframer API key is required for this step. This can be retrieved by navigating to Account -> Keys -> Copy API Key on the web application.
# Read the API key from Colab's secret storage and expose it as an
# environment variable so the SDK client below can pick it up.
import os
from google.colab import userdata

# NOTE(review): userdata.get returns None when the secret is absent, and
# assigning None into os.environ raises TypeError — confirm the secret
# 'DATAFRAMER_API_KEY' is configured in Colab before running this cell.
os.environ['DATAFRAMER_API_KEY'] = userdata.get('DATAFRAMER_API_KEY')
import io
import os
from datetime import datetime
from pathlib import Path

import dataframer
import requests
import yaml
from dataframer import Dataframer
from tenacity import retry, retry_if_result, stop_never, wait_fixed

# Fail fast with an actionable message if the key is missing: passing
# api_key=None would otherwise surface only as a confusing auth error
# on the first API call.
_api_key = os.getenv('DATAFRAMER_API_KEY')
if not _api_key:
    raise RuntimeError(
        "DATAFRAMER_API_KEY is not set; see Step 1 for how to retrieve it."
    )

# Initialize the Dataframer client
client = Dataframer(
    api_key=_api_key,
)

print("Dataframer client initialized successfully")
print(f"Using base URL: {client.base_url}")
print(f"Dataframer SDK version: {dataframer.__version__}")
Dataframer client initialized successfully
Using base URL: https://df-api.dataframer.ai
Dataframer SDK version: 0.3.1

Step 2: Upload data

EHR records are uploaded as multiple folders. Each folder contains multiple files, where each file is a patient health record. A health record can be a lab test report, a medical opinion from a doctor on the overall case, a case history, etc. The structure of the folders should look like this:
dataset/
├── patient_001/
│   ├── chest_xray_report.txt
│   └── discharge_summary.md
├── patient_002/
│   ├── blood_work.txt
│   └── clinical_notes.md
# Download the example dataset ZIP from Google Drive, then upload it as a
# seed dataset.
_resp = requests.get(
    "https://drive.google.com/uc?export=download&id=1V4mY8_c5lXHUa9pYxmBbzAFfkK8b-PJk"
)
# Fail loudly on a bad download (403/404/etc.) instead of silently
# uploading an HTML error page as if it were the ZIP archive.
_resp.raise_for_status()
zip_buffer = io.BytesIO(_resp.content)

# Timestamped name so repeated runs of this cell don't collide.
dataset = client.dataframer.seed_datasets.create_from_zip(
    name=f"patient_dataset_{datetime.now().strftime('%Y%m%d_%H%M%S')}",
    description="Patient record dataset uploaded from Google Drive ZIP",
    zip_file=zip_buffer
)

print(f"Upload complete\nDataset ID: {dataset.id}")

# Store the dataset_id for later use
dataset_id = dataset.id
Upload complete
Dataset ID: 8cb54375-63ce-485b-b382-561f7239064d

List All Datasets

This API allows you to list all datasets that have been uploaded across your entire company.
# Fetch every seed dataset visible to the company and print a summary
# line-set for each one.
datasets = client.dataframer.seed_datasets.list()

divider = "=" * 80
print(divider)
print(f"Found {len(datasets)} dataset(s)")
print(divider)

for idx, entry in enumerate(datasets, 1):
    print(f"\nDataset {idx}:")
    print(f"  Name: {entry.name}")
    print(f"  ID: {entry.id}")
    print(f"  Type: {entry.dataset_type}")
    print(f"  Files: {entry.file_count} | Folders: {entry.folder_count}")
    print(f"  Created: {entry.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
Found 1 dataset(s)

Dataset 1:
  Name: patient_dataset_1
  ID: 8cb54375-63ce-485b-b382-561f7239064d
  Type: MULTI_FILE
  Files: 2 | Folders: 0
  Created: 2025-12-09 23:45:06

Retrieve Dataset Details

This API demonstrates how to retrieve a specific dataset given a dataset ID.
# Get detailed information about the dataset created earlier.
dataset_info = client.dataframer.seed_datasets.retrieve(dataset_id=dataset_id)

print("Dataset Information:")
print("=" * 80)
print(f"ID: {dataset_info.id}")
print(f"Name: {dataset_info.name}")
print(f"Type: {dataset_info.dataset_type}")
print(f"Description: {dataset_info.description}")
print(f"Created: {dataset_info.created_at}")
print()
# Plain string: the original used an f-string with no placeholders (ruff F541).
print("Contents:")
print(f"  Files: {dataset_info.file_count}")
print(f"  Folders: {dataset_info.folder_count}")
print("=" * 80)
Dataset Information:
================================================================================
ID: 8cb54375-63ce-485b-b382-561f7239064d
Name: patient_dataset_1
Type: MULTI_FILE
Description: Patient record dataset uploaded from Google Drive ZIP
Created: 2025-12-09 23:45:06.348298+00:00

Contents:
  Files: 2
  Folders: 0
================================================================================

Step 3: Generate Specification via AI Analysis

A specification (or “spec”) is a detailed description that captures the structure, patterns, and requirements of your data. Dataframer automatically generates specifications by analyzing your seed data.
# --- Start analysis ---
# Timestamped name so repeated runs of this cell don't collide.
spec_name = f"spec_for_dataset_{dataset_id}_{datetime.now().strftime('%Y%m%d_%H%M%S')}"
# Kick off AI analysis of the seed dataset. The call returns immediately;
# completion is detected by polling the spec's status below.
# extrapolate_values / generate_distributions: presumably enrich the spec
# with additional property values and value distributions — confirm
# against the SDK documentation.
spec = client.dataframer.specs.create(
    dataset_id=dataset_id,
    name=spec_name,
    spec_generation_model_name="anthropic/claude-sonnet-4-6",
    extrapolate_values=True,
    generate_distributions=True,
)

spec_id = spec.id
print(f"Started analysis: {spec_name}")
print(f"Spec ID: {spec_id}")

def spec_not_ready(result):
    """Return True while the spec analysis has not reached a terminal status.

    Used as a tenacity `retry_if_result` predicate: polling continues for
    as long as this returns True.
    """
    terminal_states = {"SUCCEEDED", "FAILED"}
    return result.status not in terminal_states

@retry(wait=wait_fixed(5), retry=retry_if_result(spec_not_ready), stop=stop_never)
def poll_spec_status(client, spec_id):
    """Fetch the spec; tenacity re-calls this every 5s until spec_not_ready is False."""
    return client.dataframer.specs.retrieve(spec_id=spec_id)

print("Polling for spec status...")
# Blocks until the spec reaches SUCCEEDED or FAILED.
spec_status = poll_spec_status(client, spec_id)

# Abort the notebook here rather than continuing with an unusable spec.
if spec_status.status == "FAILED":
    raise RuntimeError("Spec analysis failed")

# Plain string: the original used an f-string with no placeholders (ruff F541).
print("\nAnalysis completed successfully!")
print(f"Spec ID: {spec_id}")
Started analysis: Spec for dataset 8cb54375-63ce-485b-b382-561f7239064d
Spec ID: cd7f343f-4c87-446c-b919-7f4cafb3b6e8

Analysis completed successfully!
Spec ID: cd7f343f-4c87-446c-b919-7f4cafb3b6e8

List All Specs

This API allows you to view all the specifications across the entire company.
# Retrieve all specs to get the spec ID
specs = client.dataframer.specs.list()

print("Available specs:")
# Use a distinct loop name: iterating as `spec` (as the original did)
# silently overwrote the `spec` object created in Step 3.
for spec_item in specs:
    print(f"  - {spec_item.name} (ID: {spec_item.id})")
    print(f"    Dataset: {spec_item.dataset_name}")
Available specs:
  - Spec for dataset a023c3fb-e6eb-4368-a12a-d3385cb7b6a2 (ID: cce55d46-8741-44a0-9ce0-836961d487f9)
    Dataset: patient_dataset

Review Generated Specification

For the dataset for which a specification was triggered above, view the results of the specification generation via the specs.retrieve(spec_id=spec_id) API.
# Get the spec (latest content_yaml is returned directly)
spec = client.dataframer.specs.retrieve(spec_id=spec_id)

print(f"Content YAML length: {len(spec.content_yaml)} chars")

# Parse the config to see data properties. safe_load is deliberate: never
# use yaml.load on content you do not fully control.
config = yaml.safe_load(spec.content_yaml)

# Access spec data: support both a top-level 'spec' key and a flat layout.
spec_data = config.get('spec', config)

# Plain string: the original used an f-string with no placeholders (ruff F541).
print("\nData property variations:")
if 'data_property_variations' in spec_data:
    for prop in spec_data['data_property_variations']:
        print(f"  - {prop['property_name']}: {len(prop['property_values'])} values")
Content YAML length: 21232 chars

Data property variations:
  - Primary diagnosis category: 10 values
  - Patient age group: 5 values
  - Patient gender: 2 values
  - Admission acuity and initial location: 7 values
  - Length of stay: 6 values
  - Number of specialists consulted: 5 values
  - Procedures performed during hospitalization: 7 values
  - Complexity of medication changes at discharge: 4 values
  - Social complexity and discharge planning needs: 7 values
  - Number of follow-up appointments scheduled: 4 values
  - Document formatting style: 5 values
  - Hospital course narrative style: 5 values
  - Level of medical abbreviation density: 3 values
  - Inclusion of prognostic information: 3 values
  - Patient education and counseling documentation detail: 3 values
  - Complication or adverse event occurrence: 4 values
  - Diagnostic certainty at discharge: 4 values
  - Severity of primary condition: 4 values
  - Number of secondary diagnoses: 4 values
  - Hospital system/institution name style: 5 values
  - Geographic region indicators: 5 values
  - Race/ethnicity mention: 3 values
  - Chief complaint presentation style: 3 values
  - Lab values presentation format: 4 values
  - Discharge instructions emphasis on warning signs: 4 values
  - Medication list format at discharge: 4 values

Step 4: Update Specification (Optional)

This cell demonstrates how to programmatically update a given specification. Here, we will add a new data property called Case Severity with values 'Critical', 'Severe', 'Moderate', 'Mild' and expected distributions of these values [10, 25, 40, 25]
# Get the spec (latest content_yaml is returned directly)
spec = client.dataframer.specs.retrieve(spec_id=spec_id)

# Parse the current config
current_config = yaml.safe_load(spec.content_yaml)
spec_data = current_config.get('spec', current_config)

# Example: add a new data property variation for EHR.
# base_distributions are presumably expected percentages (they sum to 100
# here) — confirm the exact semantics in the Dataframer documentation.
if 'data_property_variations' in spec_data:
    new_property = {
        'property_name': 'Case Severity',
        'property_values': ['Critical', 'Severe', 'Moderate', 'Mild'],
        'base_distributions': {
            'Critical': 10,
            'Severe': 25,
            'Moderate': 40,
            'Mild': 25
        },
        'conditional_distributions': {}
    }
    spec_data['data_property_variations'].append(new_property)
    print(f"Added new property: {new_property['property_name']}")

    # Append extra guidance to the free-text requirements for medical context.
    if 'requirements' in spec_data:
        spec_data['requirements'] += (
            "\n\nGenerated patient cases must maintain medical accuracy "
            "and include appropriate clinical correlations between symptoms, "
            "test results, and diagnoses."
        )
        # Plain string: original used an f-string with no placeholders (ruff F541).
        print("Updated requirements for medical context")

# Convert back to YAML
new_content_yaml = yaml.dump(current_config, default_flow_style=False, sort_keys=False)

# Update the spec (creates a new version automatically)
updated_spec = client.dataframer.specs.update(
    spec_id=spec_id,
    content_yaml=new_content_yaml,
)

# Plain string: original used an f-string with no placeholders (ruff F541).
print("\nSpec updated successfully")
Added new property: Case Severity
Updated requirements for medical context

Spec updated successfully

Step 5: Generate Multi-Folder Samples

Once the spec is generated and finalized after any manual modifications, we will use this spec to generate synthetic data.
# --- Start generation run ---
# Launch an asynchronous generation run from the finalized spec; status is
# polled below until it reaches a terminal state.
run = client.dataframer.runs.create(
    spec_id=spec_id,
    generation_model="anthropic/claude-sonnet-4-6",
    number_of_samples=3,

    # Separate (thinking) model used to outline each sample before generation.
    outline_model="anthropic/claude-sonnet-4-6-thinking",

    # Optional: enable iterative revision of generated samples.
    # enable_revisions=True,
    # max_revision_cycles=2,
    # revision_model="anthropic/claude-sonnet-4-6-thinking",
)

run_id = run.id

print("Started generation run")
print(f"Run ID: {run_id}")

def run_not_finished(result):
    """Return True while the generation run has not reached a terminal status.

    Tenacity `retry_if_result` predicate: polling continues for as long as
    this returns True.
    """
    done = {"SUCCEEDED", "FAILED"}
    return result.status not in done

@retry(wait=wait_fixed(10), retry=retry_if_result(run_not_finished), stop=stop_never)
def poll_run_status(client, run_id):
    """Fetch the run; tenacity re-calls this every 10s until run_not_finished is False."""
    return client.dataframer.runs.retrieve(run_id=run_id)

print("Polling for run status (this usually takes several minutes)...")
# Blocks until the run reaches SUCCEEDED or FAILED.
run_status = poll_run_status(client, run_id)

# Abort the notebook here rather than evaluating/downloading an empty run.
if run_status.status == "FAILED":
    raise RuntimeError("Generation failed")

# Plain string: the original used an f-string with no placeholders (ruff F541).
print("\nGeneration completed successfully!")
print(f"Run ID: {run_id}")
print(f"Samples completed: {run_status.samples_completed}")
Started generation run
Run ID: 063287b0-d01e-4e14-8fc9-bb0cb3ad3436

Generation completed successfully!
Run ID: 063287b0-d01e-4e14-8fc9-bb0cb3ad3436
Samples completed: 3

Step 6: Evaluate Generated Samples

While Dataframer evaluates each sample as it is generated, it also supports a post-generation evaluation. This API shows how to evaluate the generated dataset. Read the documentation for more details.
# --- Start evaluation ---
print(f"Creating evaluation for run: {run_id}")

# Trigger an asynchronous post-generation evaluation of the run's samples;
# status is polled below.
evaluation = client.dataframer.evaluations.create(
    run_id=run_id,
    evaluation_model="anthropic/claude-sonnet-4-6"
)

evaluation_id = evaluation.id

print("\nEvaluation created")
print(f"Evaluation ID: {evaluation_id}")
print(f"Created at: {evaluation.created_at}")

def eval_not_finished(result):
    """Return True while the evaluation has not reached a terminal status.

    Tenacity `retry_if_result` predicate: polling continues for as long as
    this returns True.
    """
    finished = {"SUCCEEDED", "FAILED"}
    return result.status not in finished

@retry(wait=wait_fixed(5), retry=retry_if_result(eval_not_finished), stop=stop_never)
def poll_eval_status(client, evaluation_id):
    """Fetch the evaluation; tenacity re-calls this every 5s until eval_not_finished is False."""
    return client.dataframer.evaluations.retrieve(evaluation_id=evaluation_id)

print("Polling for evaluation status...")
# Blocks until the evaluation reaches SUCCEEDED or FAILED.
eval_status = poll_eval_status(client, evaluation_id)

# Report the outcome; unlike the run, a failed evaluation is not fatal to
# the notebook, so we only print the error details.
if eval_status.status != "FAILED":
    print("\nEvaluation completed successfully!")
else:
    print("\nEvaluation failed.")
    if eval_status.error_message:
        print(f"  Error: {eval_status.error_message}")
Creating evaluation for run: 063287b0-d01e-4e14-8fc9-bb0cb3ad3436

Evaluation created
Evaluation ID: c3c00680-ca98-4611-b0f1-ffe7b1b6a478
Created at: 2025-12-09 23:51:21.646228+00:00

Evaluation completed successfully!

Step 7: Download Generated Folders with Metadata

List Generated Files

This API lists all the files that were present in the generated dataset. Generated files are included in the run details response.
# Get generated files from the run details; the run retrieval response
# carries the list of generated files directly (no separate listing call).
run_details = client.dataframer.runs.retrieve(run_id=run_id)

print("Generated Files:")
print("=" * 80)
print(f"Run ID: {run_details.id}")
print(f"Total files: {len(run_details.generated_files)}")
print("=" * 80)

# Print path, ID, size and MIME type for each generated file.
for i, file in enumerate(run_details.generated_files, 1):
    print(f"\nFile {i}:")
    print(f"  Path: {file.path}")
    print(f"  ID: {file.id}")
    print(f"  Size: {file.size_bytes} bytes")
    print(f"  Type: {file.file_type}")
Generated Files:
================================================================================
Run ID: 7528a30a-a46a-4042-9b82-8358872d37d9
Total files: 3
================================================================================

File 1:
  Path: generated_sample_1.txt
  ID: txt_sample_1
  Size: 16733 bytes
  Type: text/plain

File 2:
  Path: generated_sample_2.txt
  ID: txt_sample_2
  Size: 14441 bytes
  Type: text/plain

File 3:
  Path: generated_sample_3.txt
  ID: txt_sample_3
  Size: 18705 bytes
  Type: text/plain

Download All Files as ZIP

This API allows you to download all the generated files as a compressed ZIP file. The download is asynchronous - first request triggers ZIP generation, then poll until ready.
# Plain string: the original used an f-string with no placeholders (ruff F541).
print("Downloading generated files with metadata as ZIP...")

def download_not_ready(response):
    """Return True while the server-side ZIP is still being generated.

    Tenacity `retry_if_result` predicate: polling continues until the
    response exposes a non-None `download_url`.
    """
    url = getattr(response, 'download_url', None)
    return url is None

@retry(wait=wait_fixed(2), retry=retry_if_result(download_not_ready), stop=stop_never,
       before_sleep=lambda rs: print("  ZIP generation in progress, waiting..."))
def poll_download(client, run_id):
    """Request the all-files ZIP; tenacity re-calls every 2s until a download_url appears.

    The first call triggers server-side ZIP generation; subsequent calls
    poll until the archive is ready.
    """
    return client.dataframer.runs.files.download_all(run_id=run_id)

# Blocks until the server reports the ZIP is ready.
response = poll_download(client, run_id)

# Fetch the archive; fail loudly on an HTTP error instead of writing an
# error page to disk as if it were the ZIP.
zip_response = requests.get(response.download_url)
zip_response.raise_for_status()
output_file = Path(f"generated_samples_{run_id}.zip")
output_file.write_bytes(zip_response.content)

# Plain string: the original used an f-string with no placeholders (ruff F541).
print("\nDownload complete!")
print(f"ZIP file: {output_file.absolute()}")
Downloading generated files with metadata as ZIP...

Download complete!
ZIP file: /content/generated_samples_7528a30a-a46a-4042-9b82-8358872d37d9.zip