Open in Google Colab

Run this tutorial interactively in Google Colab

Dataframer SDK Demo

In this notebook we demonstrate how the Dataframer Python SDK (pip package: pydataframer) can be used to generate large volumes of high-quality synthetic data, where each sample can be arbitrarily large. Specifically, we will generate Electronic Health Records (EHR), where each sample represents a patient health record. A record can include lab test results, diagnoses, medical opinions, and more.

Step 1: Install and Setup SDK

Install the Dataframer SDK and a few other useful utilities.
%%capture
%pip install --upgrade pydataframer dotenv pyyaml
Initialize the Client. A Dataframer API key is required for this step. This can be retrieved by navigating to Account -> Keys -> Copy API Key on the web application.
import os
from google.colab import userdata

os.environ['DATAFRAMER_API_KEY'] = userdata.get('DATAFRAMER_API_KEY')
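If you are running outside Colab, you can load the key from a local .env file instead. Here is a minimal sketch, assuming the dotenv package installed above exposes the python-dotenv interface and that a .env file exists in the working directory:
# Alternative for non-Colab environments: load the key from a .env file.
# Assumes a .env file in the working directory containing:
#   DATAFRAMER_API_KEY=<your key>
import os

from dotenv import load_dotenv

load_dotenv()  # populates os.environ from .env
assert os.getenv('DATAFRAMER_API_KEY'), "DATAFRAMER_API_KEY not set"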
import os

from dataframer import Dataframer

# Initialize the Dataframer client
client = Dataframer(
    api_key=os.getenv('DATAFRAMER_API_KEY'),
)

print("✓ Dataframer client initialized successfully")
print(f"Using base URL: {client.base_url}")

# Check SDK version
import dataframer
print(f"Dataframer SDK version: {dataframer.__version__}")
✓ Dataframer client initialized successfully
Using base URL: https://df-api.dataframer.ai
Dataframer SDK version: 0.2.1

Step 2: Upload data

EHR records are uploaded as multiple folders. Each folder corresponds to one patient and contains multiple files, where each file is a document from that patient's health record: a lab test report, a medical opinion from a doctor on the overall case, a case history, and so on. The folder structure should look like this:
dataset/
├── patient_001/
│   ├── chest_xray_report.txt
│   └── discharge_summary.md
├── patient_002/
│   ├── blood_work.txt
│   └── clinical_notes.md
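In this demo we download a pre-built ZIP of such a dataset from Google Drive. If your records already live on disk in this layout, you could zip the folder in memory yourself before uploading; here is a minimal sketch, assuming a local dataset/ directory like the one above:
# Hypothetical alternative: build the upload ZIP from a local dataset/ folder.
import io
import zipfile
from pathlib import Path

root = Path("dataset")  # assumed local directory matching the layout above
local_zip = io.BytesIO()
with zipfile.ZipFile(local_zip, "w", zipfile.ZIP_DEFLATED) as zf:
    for path in root.rglob("*"):
        if path.is_file():
            # Preserve the patient_xxx/ folder structure inside the archive
            zf.write(path, path.relative_to(root.parent))
local_zip.seek(0)  # rewind so the upload reads from the beginning
The resulting local_zip buffer can then be passed to create_from_zip in place of the zip_buffer used below.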
import io

import requests

# Download the example dataset ZIP from Google Drive
file_id = "1nG-UB2YtQesjahBMcEY3Gwrec1_Nz3Nz"
url = f"https://drive.google.com/uc?export=download&id={file_id}"

response = requests.get(url)
response.raise_for_status()  # fail fast on a bad download

zip_buffer = io.BytesIO(response.content)

dataset = client.dataframer.datasets.create_from_zip(
    name="patient_dataset",
    description="Patient record dataset uploaded from Google Drive ZIP",
    zip_file=zip_buffer
)

print(f"Upload complete\nDataset ID: {dataset.id}")

# Store the dataset_id for later use
dataset_id = dataset.id
Upload complete
Dataset ID: 8cb54375-63ce-485b-b382-561f7239064d

List All Datasets

This API allows you to list all datasets that have been uploaded across your entire company.
# List all datasets to verify creation
datasets = client.dataframer.datasets.list()

print("=" * 80)
print(f"Found {len(datasets)} dataset(s)")
print("=" * 80)

for i, dataset in enumerate(datasets, 1):
    print(f"\n📁 Dataset {i}:")
    print(f"  Name: {dataset.name}")
    print(f"  ID: {dataset.id}")
    print(f"  Type: {dataset.dataset_type_display}")
    print(f"  Files: {dataset.file_count} | Folders: {dataset.folder_count}")
    print(f"  Created: {dataset.created_at.strftime('%Y-%m-%d %H:%M:%S')}")
Found 1 dataset(s)

📁 Dataset 1:
Name: patient_dataset_1
ID: 8cb54375-63ce-485b-b382-561f7239064d
Type: MULTI_FILE
Files: 2 | Folders: 0
Created: 2025-12-09 23:45:06
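Because the list comes back as ordinary Python objects, you can filter it client-side. For example, to look up the dataset just created by its name from the output above:
# Find the uploaded dataset by name in the listing above.
match = next((d for d in datasets if d.name == "patient_dataset_1"), None)
if match is not None:
    print(f"Found {match.name} with ID {match.id}")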

Retrieve Dataset Details

This example demonstrates how to retrieve a specific dataset given its ID.
# Get detailed information about the dataset
dataset_info = client.dataframer.datasets.retrieve(dataset_id=dataset_id)

print("📋 Dataset Information:")
print("=" * 80)
print(f"ID: {dataset_info.id}")
print(f"Name: {dataset_info.name}")
print(f"Type: {dataset_info.dataset_type} ({dataset_info.dataset_type_display})")
print(f"Description: {dataset_info.description}")
print(f"Created: {dataset_info.created_at}")
print()
print(f"📁 Contents:")
print(f"  Files: {dataset_info.file_count}")
print(f"  Folders: {dataset_info.folder_count}")
print()
print(f"🔧 Compatibility:")
compat = dataset_info.short_sample_compatibility
print(f"  Short samples:  {'✅' if compat.is_short_samples_compatible else '❌'}")
print(f"  Long samples:   {'✅' if compat.is_long_samples_compatible else '❌'}")
if compat.reason:
    print(f"  Reason: {compat.reason}")
print("=" * 80)
📋 Dataset Information:
ID: 8cb54375-63ce-485b-b382-561f7239064d
Name: patient_dataset_1
Type: MULTI_FILE (MULTI_FILE)
Description: Patient record dataset uploaded from Google Drive ZIP
Created: 2025-12-09 23:45:06.348298+00:00
📁 Contents:
Files: 2
Folders: 0
🔧 Compatibility:
Short samples: ✅
Long samples: ✅

Step 3: Generate Specification via AI Analysis

A specification (or “spec”) is a detailed description that captures the structure, patterns, and requirements of your data. Dataframer automatically generates specifications by analyzing your seed data.
import time
import sys

# --- Start analysis ---
spec_name = f"Spec for dataset {dataset_id}"
analysis = client.dataframer.analyze.create(
    dataset_id=dataset_id,
    name=spec_name,
    analysis_model_name="claude-sonnet-4-5",
    ## Note: This model can be used to generate evals data but not data to train competing models
    extrapolate_values=True,
    generate_distributions=True,
)

task_id = analysis.task_id
print(f"Started analysis: {spec_name}")
print(f"Task ID: {task_id}")

# --- Polling setup ---
poll_interval = 5
spinner_sleep = 0.1
start_time = time.time()

def show_bar(status, elapsed, poll_interval=5):
    # Render a one-line spinner with elapsed time and current task status.
    # poll_interval is accepted for call symmetry but not used for rendering.
    spinner_frames = ["⠋", "⠙", "⠹", "⠸", "⠼", "⠴", "⠦", "⠧", "⠇", "⠏"]
    fps = 10

    frame_index = int(elapsed * fps) % len(spinner_frames)
    spinner = spinner_frames[frame_index]

    elapsed_int = int(elapsed)
    sys.stdout.write(f"\r{spinner} {elapsed_int:3d}s | Status: {status:<10}")
    sys.stdout.flush()


last_poll_time = 0.0
status = {"status": "PENDING"}
state = status["status"]

while True:
    now = time.time()
    elapsed = now - start_time

    if (now - last_poll_time) >= poll_interval or last_poll_time == 0.0:
        status = client.dataframer.analyze.get_status(task_id=task_id)
        state = status["status"]
        last_poll_time = now

    show_bar(state, elapsed, poll_interval=poll_interval)

    if state == "COMPLETED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("\nAnalysis completed successfully! ✅")
        print(f"Spec ID: {status.get('spec_id')}")
        break

    if state == "FAILED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("❌ Analysis failed.")
        raise RuntimeError(status.get("error", "Unknown error"))

    time.sleep(spinner_sleep)

Started analysis: Spec for dataset 8cb54375-63ce-485b-b382-561f7239064d
Task ID: d44e783e-9bce-48c0-9095-c7a9c729bd5e
Analysis completed successfully! ✅
Spec ID: cd7f343f-4c87-446c-b919-7f4cafb3b6e8

List All Specs

This API allows you to view all the specifications across the entire company.
# Retrieve all specs to get the spec ID
specs = client.dataframer.specs.list()

print("Available specs:")
for spec in specs:
    print(f"  • {spec.name} (ID: {spec.id})")
    print(f"    Dataset: {spec.dataset_name}")

# Store the spec_id for generation
spec_id = specs[0].id  # Use your newly created spec
time.sleep(5)
Available specs:
  • Spec for dataset a023c3fb-e6eb-4368-a12a-d3385cb7b6a2 (ID: cce55d46-8741-44a0-9ce0-836961d487f9)
    Dataset: patient_dataset

Review Generated Specification

For the dataset analyzed above, view the resulting specification via the specs.retrieve(spec_id=spec_id) API.
# Get the latest version details
spec = client.dataframer.specs.retrieve(spec_id=spec_id)
versions = client.dataframer.specs.versions.list(spec_id=spec_id)

if len(versions) > 0:
    latest_version = client.dataframer.specs.versions.retrieve(
        spec_id=spec_id,
        version_id=versions[0].id
    )

    print(f"Latest version: {latest_version.version}")
    print(f"Config YAML length: {len(latest_version.config_yaml)} chars")

    # Parse the config to see data properties
    import yaml
    config = yaml.safe_load(latest_version.config_yaml)

    # Access spec data
    spec_data = config.get('spec', config)

    print(f"\nData property variations:")
    if 'data_property_variations' in spec_data:
        for prop in spec_data['data_property_variations']:
            print(f"  • {prop['property_name']}: {len(prop['property_values'])} values")
Latest version: 1
Config YAML length: 21232 chars
Data property variations:
• Primary diagnosis category: 10 values
• Patient age group: 5 values
• Patient gender: 2 values
• Admission acuity and initial location: 7 values
• Length of stay: 6 values
• Number of specialists consulted: 5 values
• Procedures performed during hospitalization: 7 values
• Complexity of medication changes at discharge: 4 values
• Social complexity and discharge planning needs: 7 values
• Number of follow-up appointments scheduled: 4 values
• Document formatting style: 5 values
• Hospital course narrative style: 5 values
• Level of medical abbreviation density: 3 values
• Inclusion of prognostic information: 3 values
• Patient education and counseling documentation detail: 3 values
• Complication or adverse event occurrence: 4 values
• Diagnostic certainty at discharge: 4 values
• Severity of primary condition: 4 values
• Number of secondary diagnoses: 4 values
• Hospital system/institution name style: 5 values
• Geographic region indicators: 5 values
• Race/ethnicity mention: 3 values
• Chief complaint presentation style: 3 values
• Lab values presentation format: 4 values
• Discharge instructions emphasis on warning signs: 4 values
• Medication list format at discharge: 4 values
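The parsed config also records the expected distribution for each property. Here is a small sketch for inspecting one property's values and weights; the base_distributions key is an assumption based on the structure used in Step 4 below:
# Inspect one property's values and expected distribution.
# The 'base_distributions' key is assumed from the structure used in Step 4.
for prop in spec_data.get('data_property_variations', []):
    if prop['property_name'] == 'Patient gender':
        print(f"Values: {prop['property_values']}")
        print(f"Distribution: {prop.get('base_distributions', 'not recorded')}")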

Step 4: Update Specification (Optional)

This cell demonstrates how to programmatically update a given specification. Here, we add a new data property called Case Severity with the values 'Critical', 'Severe', 'Moderate', and 'Mild', and an expected distribution of [10, 25, 40, 25] across those values.
import yaml

# Get the latest version
versions = client.dataframer.specs.versions.list(spec_id=spec_id)
latest_version = client.dataframer.specs.versions.retrieve(
    spec_id=spec_id,
    version_id=versions[0].id
)

# Parse the current config
current_config = yaml.safe_load(latest_version.config_yaml)
spec_data = current_config.get('spec', current_config)

# Example: Add a new data property variation for EHR
if 'data_property_variations' in spec_data:
    new_property = {
        'property_name': 'Case Severity',
        'property_values': ['Critical', 'Severe', 'Moderate', 'Mild'],
        'base_distributions': {
            'Critical': 10,
            'Severe': 25,
            'Moderate': 40,
            'Mild': 25
        },
        'conditional_distributions': {}
    }
    spec_data['data_property_variations'].append(new_property)
    print(f"✓ Added new property: {new_property['property_name']}")

    # Update requirements for medical context
    if 'requirements' in spec_data:
        spec_data['requirements'] += (
            "\n\nGenerated patient cases must maintain medical accuracy "
            "and include appropriate clinical correlations between symptoms, "
            "test results, and diagnoses."
        )
        print(f"✓ Updated requirements for medical context")

# Convert back to YAML
new_config_yaml = yaml.dump(current_config, default_flow_style=False, sort_keys=False)

# Update the spec (creates a new version automatically)
updated_spec = client.dataframer.specs.update(
    spec_id=spec_id,
    config_yaml=new_config_yaml,  # Your edits - THIS is what matters!
    results_yaml=latest_version.results_yaml,  # Historical reference
    orig_results_yaml=latest_version.orig_results_yaml,  # Backup
    runtime_params=latest_version.runtime_params  # Metadata
)

print(f"\n✓ Updated spec. New version: {updated_spec.latest_version}")
✓ Added new property: Case Severity
✓ Updated requirements for medical context
✓ Updated spec. New version: 2

Step 5: Generate Multi-Folder Samples

Once the spec is generated and finalized after any manual modifications, we use it to generate synthetic data. Refer to the documentation for more details on sample generation.
import time
import sys

# --- Start generation run ---
generation_result = client.dataframer.generate.create(
    spec_id=spec_id,

    generation_model="claude-sonnet-4-5",
    ## Note: This model can be used to generate evals data but not data to train competing models
    number_of_samples=3,
    sample_type="long",  # Multi-folder requires "long" for proper file relationships

    ## Advanced configuration for long samples
    outline_model="claude-sonnet-4-5",
    ## Note: This model can be used to generate evals data but not data to train competing models
    outline_thinking_budget=2000,

    # enable_revisions=True,
    # max_revision_cycles=2,
    # revision_model="claude-sonnet-4-5",
    # ## Note: This model can be used to generate evals data but not data to train competing models
    # revision_thinking_budget=1500,
)

task_id = generation_result.task_id
run_id = generation_result.run_id

print("Started generation run")
print(f"Task ID: {task_id}")
print(f"Run ID : {run_id}")
print(f"Status : {generation_result.status}")


# --- Polling setup ---
poll_interval = 10
start_time = time.time()

spinner_sleep = 0.1
last_poll_time = 0.0
state = generation_result.status


while True:
    now = time.time()
    elapsed = now - start_time

    if (now - last_poll_time) >= poll_interval or last_poll_time == 0.0:
        status = client.dataframer.generate.retrieve_status(task_id=task_id)
        state = status["status"]
        last_poll_time = now

    show_bar(state, elapsed, poll_interval=poll_interval)

    if state == "COMPLETED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("\nGeneration completed successfully! ✅")
        print(f"Run ID   : {run_id}")

        # Detailed run status
        run_status = client.dataframer.runs.status(run_id=run_id)
        print(f"Run state: {run_status['status']}")
        break

    if state == "FAILED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("\nGeneration failed. ❌")
        raise RuntimeError(status.get("error", "Unknown error"))

    time.sleep(spinner_sleep)

Started generation run
Task ID: df53ce8b-8102-4b49-b9aa-7a0a2b476fa5
Run ID : 063287b0-d01e-4e14-8fc9-bb0cb3ad3436
Status : ACCEPTED
Generation completed successfully! ✅
Run ID : 063287b0-d01e-4e14-8fc9-bb0cb3ad3436
Run state: SUCCEEDED

Step 6: Evaluate Generated Samples

While Dataframer evaluates each sample as it is generated, it also supports a post-generation evaluation. This API shows how to evaluate the generated dataset. Read the documentation for more details.

import time
import sys

# --- Start evaluation ---
print(f"Creating evaluation for run: {run_id}")

evaluation = client.dataframer.evaluations.create(
    run_id=run_id,
    evaluation_model="claude-sonnet-4-5"
    # Note: This model can be used to generate evals data but not data to train competing models
)

evaluation_id = evaluation.id

print("\nEvaluation created")
print(f"Evaluation ID: {evaluation_id}")
print(f"Created at   : {evaluation.created_at}")

# --- Polling setup ---
poll_interval = 5
spinner_sleep = 0.1
start_time = time.time()
last_poll_time = 0.0
state = evaluation.status

while True:
    now = time.time()
    elapsed = now - start_time

    if (now - last_poll_time) >= poll_interval or last_poll_time == 0.0:
        eval_status = client.dataframer.evaluations.retrieve(evaluation_id=evaluation_id)
        state = eval_status.status
        last_poll_time = now

    show_bar(state, elapsed, poll_interval=poll_interval)

    if state == "COMPLETED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("\nEvaluation completed successfully! ✅")
        break

    if state == "FAILED":
        sys.stdout.write("\r" + " " * 80 + "\r")
        sys.stdout.flush()

        print("\nEvaluation failed. ❌")
        if eval_status.error_message:
            print(f"  Error: {eval_status.error_message}")
        break

    time.sleep(spinner_sleep)

Creating evaluation for run: 063287b0-d01e-4e14-8fc9-bb0cb3ad3436

Evaluation created
Evaluation ID: c3c00680-ca98-4611-b0f1-ffe7b1b6a478
Created at : 2025-12-09 23:51:21.646228+00:00
Evaluation completed successfully! ✅
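After the loop exits you can re-fetch the finished evaluation; this sketch prints only the fields already used in this notebook (status, created_at, error_message):
# Re-fetch the completed evaluation and print the fields used above.
final_eval = client.dataframer.evaluations.retrieve(evaluation_id=evaluation_id)
print(f"Status    : {final_eval.status}")
print(f"Created at: {final_eval.created_at}")
if final_eval.status == "FAILED" and final_eval.error_message:
    print(f"Error     : {final_eval.error_message}")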

Step 7: Download Generated Folders with Metadata

List Generated Files

This API lists all the files in the generated dataset.
# Get generated files for the run
result = client.dataframer.runs.generated_files.list(run_id=run_id)

print("📁 Generated Files:")
print("=" * 80)
print(f"Run ID: {result.run_id}")
print(f"Total files: {len(result.generated_files)}")
print("=" * 80)

for i, file in enumerate(result.generated_files, 1):
    print(f"\n📄 File {i}:")
    print(f"  Name: {file.name}")
    print(f"  ID: {file.id}")
    print(f"  Status: {file.status}")
    print(f"  Size: {file.size} bytes")
    print(f"  Type: {file.type}")
    if file.status_details:
        print(f"  Details: {file.status_details}")
    if file.generation_model:
        print(f"  Model: {file.generation_model}")
📁 Generated Files:
Run ID: 7528a30a-a46a-4042-9b82-8358872d37d9
Total files: 3

📄 File 1:
  Name: generated_sample_1.txt
  ID: txt_sample_1
  Status: Completed
  Size: 16733 bytes
  Type: text/plain
  Details: Completed successfully in 0 iterations
  Model: litellm/anthropic/claude-sonnet-4-5-20250929

📄 File 2:
  Name: generated_sample_2.txt
  ID: txt_sample_2
  Status: Completed
  Size: 14441 bytes
  Type: text/plain
  Details: Completed successfully in 0 iterations
  Model: litellm/anthropic/claude-sonnet-4-5-20250929

📄 File 3:
  Name: generated_sample_3.txt
  ID: txt_sample_3
  Status: Completed
  Size: 18705 bytes
  Type: text/plain
  Details: Completed successfully in 0 iterations
  Model: litellm/anthropic/claude-sonnet-4-5-20250929

Download All Files as ZIP

This API allows you to download all the generated files as a compressed ZIP file.
from pathlib import Path

print(f"📥 Downloading generated files with metadata as ZIP...")

# Download ZIP file from backend
# The ZIP contains:
# - All generated files with folder structure
# - .metadata files with evaluation tags/classifications
# - top_level.metadata with evaluation summary
downloaded_zip = client.dataframer.runs.generated_files.download_all(
    run_id=run_id
)

# Save ZIP file
output_file = Path(f"generated_samples_{run_id}.zip")
output_file.write_bytes(downloaded_zip.read())

print(f"\n✅ Download complete!")
print(f"📦 ZIP file: {output_file.absolute()}")
📥 Downloading generated files with metadata as ZIP...

✅ Download complete!
📦 ZIP file: /content/generated_samples_7528a30a-a46a-4042-9b82-8358872d37d9.zip
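To inspect the archive locally, extract it with the standard library and read the evaluation metadata. A sketch follows; the top_level.metadata file name is taken from the comments in the download cell above:
# Extract the downloaded ZIP and peek at the evaluation summary.
import zipfile
from pathlib import Path

extract_dir = Path(f"generated_samples_{run_id}")
with zipfile.ZipFile(output_file) as zf:
    zf.extractall(extract_dir)
print(f"Extracted {len(list(extract_dir.rglob('*')))} entries to {extract_dir}/")

# top_level.metadata holds the evaluation summary (per the comments above).
summary = extract_dir / "top_level.metadata"
if summary.exists():
    print(summary.read_text()[:500])  # preview the first 500 characters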

Cleanup (Optional)

Uncomment the code to delete the spec and datasets generated in this notebook.
## ⚠️ Warning: This action cannot be undone. All files will be permanently deleted.

## Step 1: Get all specs for this dataset
# all_specs = client.dataframer.specs.list()
# dataset_specs = [spec for spec in all_specs if spec.dataset_name == dataset.name]

# print(f"Found {len(dataset_specs)} spec(s) referencing this dataset")

## Uncomment to delete the spec
## Step 2: Delete all specs that reference this dataset
# for spec in dataset_specs:
    # print(f"  Deleting spec: {spec.name} (ID: {spec.id})")
    # client.dataframer.specs.delete(spec_id=spec.id)
    # print(f"    ✓ Deleted spec {spec.id}")

## Step 3: Delete the dataset
## Note: Cannot delete a dataset that is referenced by any specs.
## Uncomment to delete the dataset after deleting all specs
# client.dataframer.datasets.delete(dataset_id=dataset_id)
# print(f"✓ Deleted dataset {dataset_id}")

print("⚠️\nDeletion is commented out for safety")
print("Uncomment the code above to delete when ready")
⚠️
Deletion is commented out for safety
Uncomment the code above to delete when ready