Examples

This page provides detailed examples of using OptiPFair for pruning language models and visualizing bias.

Basic Pruning Example

This example demonstrates how to prune a LLaMA model using the default settings:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair import prune_model

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Get original parameter count
original_params = sum(p.numel() for p in model.parameters())
print(f"Original model parameters: {original_params:,}")

# Simple prompt for testing
prompt = "Paris is the capital of"

# Test original model
inputs = tokenizer(prompt, return_tensors="pt")
outputs = model.generate(inputs.input_ids, max_length=50)
original_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Original model output: {original_output}")

# Apply pruning with default settings (10% pruning, MAW method)
pruned_model = prune_model(model)

# Get pruned parameter count
pruned_params = sum(p.numel() for p in pruned_model.parameters())
print(f"Pruned model parameters: {pruned_params:,}")
print(f"Reduction: {original_params - pruned_params:,} ({(original_params - pruned_params) / original_params * 100:.2f}%)")

# Test pruned model
outputs = pruned_model.generate(inputs.input_ids, max_length=50)
pruned_output = tokenizer.decode(outputs[0], skip_special_tokens=True)
print(f"Pruned model output: {pruned_output}")

# Save pruned model
pruned_model.save_pretrained("./pruned-llama")
tokenizer.save_pretrained("./pruned-llama")
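
Because the pruned model is saved with save_pretrained, it can be reloaded later like any Hugging Face checkpoint. The sketch below assumes the saved config reflects the pruned layer sizes and reuses the ./pruned-llama path from the example above:

from transformers import AutoModelForCausalLM, AutoTokenizer

# Reload the pruned checkpoint saved above (assumes the saved config
# already reflects the pruned intermediate sizes)
reloaded_model = AutoModelForCausalLM.from_pretrained("./pruned-llama")
reloaded_tokenizer = AutoTokenizer.from_pretrained("./pruned-llama")

# Quick sanity check that the reloaded model still generates text
inputs = reloaded_tokenizer("Paris is the capital of", return_tensors="pt")
outputs = reloaded_model.generate(inputs.input_ids, max_length=50)
print(reloaded_tokenizer.decode(outputs[0], skip_special_tokens=True))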

Bias Visualization Example

This example demonstrates how to visualize bias in model activations:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair.bias import visualize_bias, visualize_pca

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
device = "cuda" if torch.cuda.is_available() else "cpu"

model = AutoModelForCausalLM.from_pretrained(
    model_name,
    torch_dtype=torch.float16,
    device_map=device
)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define demographic prompt pairs
prompt_pairs = [
    ("The white man walked through the neighborhood. The police officer thought he", 
     "The black man walked through the neighborhood. The police officer thought he"),
    ("The male engineer proposed a solution. The team thought it was",
     "The female engineer proposed a solution. The team thought it was")
]

# Generate comprehensive visualization and get metrics
_, metrics = visualize_bias(
    model,
    tokenizer,
    prompt_pairs=prompt_pairs,
    visualization_types=["mean_diff", "heatmap", "pca"],
    layers="first_middle_last",
    output_dir="./bias_analysis",
    show_progress=True
)

# Print summary metrics
for pair_key, pair_data in metrics.items():
    print(f"\n{pair_key}:")
    print(f"  Prompt 1: '{pair_data['prompt1']}'")
    print(f"  Prompt 2: '{pair_data['prompt2']}'")

    overall = pair_data["metrics"]["overall_metrics"]
    print(f"  Overall mean difference: {overall['mean_difference']:.6f}")
    print(f"  Max difference: {overall['max_difference']:.6f}")

    # Print layer progression
    for component, comp_data in pair_data["metrics"]["component_metrics"].items():
        if "progression_metrics" in comp_data:
            prog = comp_data["progression_metrics"]
            print(f"\n  {component}:")
            print(f"    First-to-last ratio: {prog['first_to_last_ratio']:.2f}")
            print(f"    Increasing trend: {prog['is_increasing']}")

Combined Bias Analysis and Pruning

This example shows how to analyze bias before and after pruning:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair import prune_model
from optipfair.bias import visualize_bias

# Load model
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Define test prompt for bias analysis
bias_prompt_pairs = [
    ("The white student submitted their assignment. The professor thought it was",
     "The asian student submitted their assignment. The professor thought it was")
]

# Analyze bias in original model
print("Analyzing bias in original model...")
_, original_metrics = visualize_bias(
    model, 
    tokenizer,
    prompt_pairs=bias_prompt_pairs,
    visualization_types=["mean_diff"],
    output_dir="./bias_analysis/original"
)

# Apply pruning
print("\nApplying pruning...")
pruned_model, stats = prune_model(
    model=model,
    pruning_type="MLP_GLU",
    neuron_selection_method="MAW",
    pruning_percentage=20,
    show_progress=True,
    return_stats=True
)

print(f"Reduction: {stats['reduction']:,} parameters ({stats['percentage_reduction']:.2f}%)")

# Analyze bias in pruned model
print("\nAnalyzing bias in pruned model...")
_, pruned_metrics = visualize_bias(
    pruned_model,
    tokenizer,
    prompt_pairs=bias_prompt_pairs,
    visualization_types=["mean_diff"],
    output_dir="./bias_analysis/pruned"
)

# Compare bias metrics
original_overall = original_metrics["pair_1"]["metrics"]["overall_metrics"]
pruned_overall = pruned_metrics["pair_1"]["metrics"]["overall_metrics"]

print("\nBias Comparison:")
print(f"Original model mean difference: {original_overall['mean_difference']:.6f}")
print(f"Pruned model mean difference: {pruned_overall['mean_difference']:.6f}")

bias_change = (pruned_overall['mean_difference'] - original_overall['mean_difference']) / original_overall['mean_difference'] * 100
print(f"Bias change: {bias_change:+.2f}%")

if bias_change < 0:
    print("Bias decreased after pruning")
else:
    print("Bias increased after pruning")

Comparing Neuron Selection Methods

This example compares the results of different neuron selection methods:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair import prune_model
import time

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
original_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Test prompt
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt")

# Original model output
original_output = tokenizer.decode(
    original_model.generate(inputs.input_ids, max_length=50)[0], 
    skip_special_tokens=True
)
print(f"Original: {original_output}")

# Compare different neuron selection methods
methods = ["MAW", "VOW", "PON"]
pruning_percentage = 20

results = {}

for method in methods:
    print(f"\nPruning with {method} method...")

    # Create a fresh copy of the model for this method
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Apply pruning
    pruned_model, stats = prune_model(
        model=model,
        pruning_type="MLP_GLU",
        neuron_selection_method=method,
        pruning_percentage=pruning_percentage,
        show_progress=True,
        return_stats=True
    )

    # Test generation
    start_time = time.time()
    outputs = pruned_model.generate(inputs.input_ids, max_length=50)
    end_time = time.time()

    pruned_output = tokenizer.decode(outputs[0], skip_special_tokens=True)

    # Store results
    results[method] = {
        "output": pruned_output,
        "parameters": stats["pruned_parameters"],
        "reduction": stats["percentage_reduction"],
        "inference_time": end_time - start_time
    }

    print(f"{method} output: {pruned_output}")
    print(f"Parameter reduction: {stats['percentage_reduction']:.2f}%")
    print(f"Inference time: {end_time - start_time:.4f}s")

# Compare results
print("\n===== COMPARISON =====")
for method, data in results.items():
    print(f"\n{method}:")
    print(f"  Output: {data['output'][:100]}...")
    print(f"  Parameter reduction: {data['reduction']:.2f}%")
    print(f"  Inference time: {data['inference_time']:.4f}s")

Pruning with Target Expansion Rate

This example demonstrates pruning a model to achieve a specific expansion rate:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair import prune_model
from optipfair.pruning.utils import get_model_layers

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Analyze the model's current expansion rate
layers = get_model_layers(model)
first_mlp = layers[0].mlp
hidden_size = first_mlp.gate_proj.in_features
intermediate_size = first_mlp.gate_proj.out_features
current_expansion_rate = (intermediate_size / hidden_size) * 100

print(f"Model: {model_name}")
print(f"Hidden size: {hidden_size}")
print(f"Intermediate size: {intermediate_size}")
print(f"Current expansion rate: {current_expansion_rate:.2f}%")

# Set target expansion rate (e.g., 200% instead of the typical 400% for LLaMA)
target_expansion_rate = 200.0
print(f"Target expansion rate: {target_expansion_rate:.2f}%")

# Apply pruning with target expansion rate
pruned_model, stats = prune_model(
    model=model,
    pruning_type="MLP_GLU",
    neuron_selection_method="MAW",
    expansion_rate=target_expansion_rate,
    show_progress=True,
    return_stats=True
)

# Verify the new expansion rate
layers = get_model_layers(pruned_model)
first_mlp = layers[0].mlp
new_intermediate_size = first_mlp.gate_proj.out_features
new_expansion_rate = (new_intermediate_size / hidden_size) * 100

print(f"\nAfter pruning:")
print(f"New intermediate size: {new_intermediate_size}")
print(f"New expansion rate: {new_expansion_rate:.2f}%")
print(f"Parameter reduction: {stats['percentage_reduction']:.2f}%")

# Test generation
prompt = "The capital of France is"
inputs = tokenizer(prompt, return_tensors="pt")

original_model = AutoModelForCausalLM.from_pretrained(model_name)
original_output = tokenizer.decode(
    original_model.generate(inputs.input_ids, max_length=50)[0],
    skip_special_tokens=True
)

pruned_output = tokenizer.decode(
    pruned_model.generate(inputs.input_ids, max_length=50)[0],
    skip_special_tokens=True
)

print(f"\nOriginal: {original_output}")
print(f"Pruned: {pruned_output}")

# Save pruned model
pruned_model.save_pretrained(f"./llama-er{int(target_expansion_rate)}")
tokenizer.save_pretrained(f"./llama-er{int(target_expansion_rate)}")

Benchmarking Pruned Models

This example demonstrates how to benchmark and compare the performance of pruned models:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair import prune_model
from optipfair.evaluation.benchmarks import compare_models_inference

# Load original model
model_name = "meta-llama/Llama-3.2-1B"
original_model = AutoModelForCausalLM.from_pretrained(model_name)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Create pruned models with different settings
pruning_percentages = [10, 20, 30, 40]
pruned_models = {}

for percentage in pruning_percentages:
    print(f"Pruning model with {percentage}% pruning...")

    # Create a fresh copy of the model
    model = AutoModelForCausalLM.from_pretrained(model_name)

    # Apply pruning
    pruned_model, stats = prune_model(
        model=model,
        pruning_type="MLP_GLU",
        neuron_selection_method="MAW",
        pruning_percentage=percentage,
        show_progress=True,
        return_stats=True
    )

    pruned_models[percentage] = {
        "model": pruned_model,
        "stats": stats
    }

    print(f"Parameter reduction: {stats['percentage_reduction']:.2f}%")

# Test prompts for benchmarking
test_prompts = [
    "The capital of France is",
    "Machine learning is a field of",
    "The fastest land animal is the",
    "In physics, the theory of relativity",
    "The industrial revolution began in"
]

# Benchmark each model
print("\nBenchmarking models...")

results = {}
for percentage, data in pruned_models.items():
    print(f"\nEvaluating {percentage}% pruned model...")

    comparison = compare_models_inference(
        original_model=original_model,
        pruned_model=data["model"],
        tokenizer=tokenizer,
        prompts=test_prompts,
        max_new_tokens=100
    )

    results[percentage] = comparison

    print(f"Speedup: {comparison['speedup']:.2f}x")
    print(f"Tokens per second improvement: {comparison['tps_improvement_percent']:.2f}%")

# Print summary
print("\n===== PERFORMANCE SUMMARY =====")
print(f"{'Pruning %':<10} {'Param Reduction':<20} {'Speedup':<10} {'TPS Improvement':<20}")
print("-" * 60)

for percentage in pruning_percentages:
    param_reduction = pruned_models[percentage]["stats"]["percentage_reduction"]
    speedup = results[percentage]["speedup"]
    tps_improvement = results[percentage]["tps_improvement_percent"]

    print(f"{percentage:<10} {param_reduction:<20.2f}% {speedup:<10.2f}x {tps_improvement:<20.2f}%")

Using the CLI for Multiple Models

This bash script demonstrates how to use the OptiPFair CLI to prune multiple models:

#!/bin/bash

# Models to prune
MODELS=(
    "meta-llama/Llama-3.2-1B"
    "meta-llama/Llama-3.2-3B"
)

# Pruning percentages to try
PERCENTAGES=(10 20 30)

# Method to use
METHOD="MAW"

# Output directory
OUTPUT_DIR="./pruned-models"
mkdir -p "$OUTPUT_DIR"

# Log file
LOG_FILE="$OUTPUT_DIR/pruning_log.txt"
echo "OptiPFair Pruning Log - $(date)" > "$LOG_FILE"

# Loop through models and percentages
for MODEL in "${MODELS[@]}"; do
    MODEL_NAME=$(basename "$MODEL")

    for PERCENTAGE in "${PERCENTAGES[@]}"; do
        OUTPUT_PATH="$OUTPUT_DIR/${MODEL_NAME}_pruned_${PERCENTAGE}p"

        echo "Pruning $MODEL with $PERCENTAGE% using $METHOD method..."
        echo "Output will be saved to $OUTPUT_PATH"

        # Log the command
        echo -e "\n\n===== $MODEL - $PERCENTAGE% - $(date) =====" >> "$LOG_FILE"

        # Run the pruning command
        optipfair prune \
            --model-path "$MODEL" \
            --pruning-type MLP_GLU \
            --method "$METHOD" \
            --pruning-percentage "$PERCENTAGE" \
            --output-path "$OUTPUT_PATH" \
            --device cuda | tee -a "$LOG_FILE"

        echo "Completed pruning $MODEL with $PERCENTAGE%"
        echo "-------------------------------------------"
    done
done

echo "All pruning jobs completed. Results saved to $OUTPUT_DIR"

Advanced Bias Visualization

This example demonstrates more advanced bias visualization capabilities:

import torch
from transformers import AutoModelForCausalLM, AutoTokenizer
from optipfair.bias import visualize_pca, visualize_heatmap
from optipfair.bias.defaults import generate_prompt_pairs, PROMPT_TEMPLATES, ATTRIBUTES

# Generate custom prompt pairs
template = "The {attribute} doctor examined the patient. The nurse thought"
prompt_pairs = generate_prompt_pairs(
    template=template,
    attribute_category="gender",
    attribute_pairs=[("male", "female"), ("male", "non-binary")]
)

# Load model and tokenizer
model_name = "meta-llama/Llama-3.2-1B"
model = AutoModelForCausalLM.from_pretrained(model_name, torch_dtype=torch.float16)
tokenizer = AutoTokenizer.from_pretrained(model_name)

# Perform detailed PCA analysis for specific layer
visualize_pca(
    model=model,
    tokenizer=tokenizer,
    prompt_pair=prompt_pairs[0],
    layer_key="attention_output_layer_8",
    output_dir="./detailed_analysis",
    figure_format="pdf",
    highlight_diff=True
)

# Generate heatmap for the same layer
visualize_heatmap(
    model=model,
    tokenizer=tokenizer,
    prompt_pair=prompt_pairs[0],
    layer_key="attention_output_layer_8",
    output_dir="./detailed_analysis",
    figure_format="pdf"
)
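
The same detailed analysis can be repeated across every generated prompt pair and a few layers of interest by looping over the call above. The layer keys below follow the naming used in this example, but the specific layers chosen are an arbitrary selection; a separate output_dir per pair avoids overwriting figures:

# Repeat the PCA analysis for every prompt pair across a few layers of interest
for i, pair in enumerate(prompt_pairs):
    for layer_key in ["attention_output_layer_0", "attention_output_layer_8", "attention_output_layer_15"]:
        visualize_pca(
            model=model,
            tokenizer=tokenizer,
            prompt_pair=pair,
            layer_key=layer_key,
            output_dir=f"./detailed_analysis/pair_{i}",  # separate directory per pair
            figure_format="pdf",
            highlight_diff=True,
        )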