Quick Start
Mini-Example: Human-GPT Perception of Anger in Reddit Comments
# Install dependencies (in a notebook/Colab cell)
!pip install -q wordcloud
!pip install -q git+https://github.com/raymondli-me/haam.git
import pandas as pd
import warnings
from haam import HAAM

warnings.filterwarnings('ignore')
github_url_processed = 'https://raw.githubusercontent.com/raymondli-me/haam/main/data/anger_family_with_angry_word_count.csv'

try:
    df_loaded_github = pd.read_csv(github_url_processed)
    df_filtered_github = df_loaded_github.copy()  # use a consistent variable name downstream
    print(f"✓ Loaded data from GitHub CSV: {df_filtered_github.shape[0]} rows, {df_filtered_github.shape[1]} columns")
except Exception as e:
    print(f"Error loading data from GitHub: {e}")
    print(f"Please ensure the URL is correct: {github_url_processed}")
    df_filtered_github = None  # prevent further execution if the file cannot be loaded
if df_filtered_github is not None:
    # Prepare data for HAAM; ensure the column names match those in your CSV
    required_cols = {'text', 'angry_word_count', 'human_sum_score', 'gpt_sum_score'}
    if required_cols.issubset(df_filtered_github.columns):
        texts_github = df_filtered_github['text'].values.tolist()
        criterion_github = df_filtered_github['angry_word_count'].values.astype(float)      # X: angry word count
        human_judgment_github = df_filtered_github['human_sum_score'].values.astype(float)  # HU: human ratings
        ai_judgment_github = df_filtered_github['gpt_sum_score'].values.astype(float)       # AI: GPT ratings

        haam_github = HAAM(
            criterion=criterion_github,            # X: the ground-truth variable (angry word count here)
            ai_judgment=ai_judgment_github,        # AI: the AI system's predictions/ratings
            human_judgment=human_judgment_github,  # HU: human ratings/judgments
            texts=texts_github,                    # raw text data for topic modeling
            n_components=200,                      # number of principal components to extract
            min_cluster_size=10,                   # HDBSCAN parameter for topic clustering
            min_samples=2,                         # HDBSCAN parameter for core points
            umap_n_components=3,                   # 3D UMAP for topic embedding
            standardize=True,                      # standardize variables for DML
            sample_split_post_lasso=False,         # use the full sample for maximum power
            auto_run=True                          # run the analysis immediately
        )
        results_github = haam_github.create_comprehensive_pc_analysis(
            # pc_indices=specific_pcs,       # which PCs to analyze (commented out = use the first 15)
            k_topics=3,                      # number of topics per pole in word clouds
            max_words=100,                   # maximum words per word cloud
            generate_wordclouds=True,        # create the word cloud table
            generate_3d_umap=True,           # create the 3D UMAP with PC arrows
            umap_arrow_k=1,                  # single topic endpoints for arrows (clean visualization)
            show_data_counts=True,           # show "HU: n=3" warnings for sparse data
            output_dir='haam_wordclouds_final_aligned_sparse_github',  # output directory
            display=True                     # show plots in the notebook/Colab
        )
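Once the run finishes, the fitted results can also be inspected programmatically; the `results` keys below are the same ones used in the Simple Usage section further down:
        print(haam_github.results['model_summary'])          # model fit summary
        print(f"Top PCs: {haam_github.results['top_pcs']}")  # top PCs selected by the analysis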
Understanding the Output
The analysis produces several key outputs:
Word Clouds: Show which topics are associated with high vs. low PC scores, colored by validity (a sketch of this coloring rule follows the list):
Dark red: Consensus high (top quartile for X, HU, and AI)
Light red: Any high (top quartile for HU & AI only)
Dark blue: Consensus low (bottom quartile for all)
Light blue: Any low (bottom quartile for HU & AI only)
Grey: Mixed signals
3D UMAP Visualization: Interactive plot showing topic clusters with PC directional arrows
PoMA Analysis: Percentage of accuracy flowing through measured cues vs. direct pathways (see the mediation sketch after this list)
Model Performance: R² values showing how well PCs predict X, HU, and AI judgments
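The quartile-based coloring above can be written out as a small helper. This is an illustrative sketch of the rule exactly as described, not HAAM's internal implementation; the function and variable names are ours:

def wordcloud_color(x_q, hu_q, ai_q):
    """Illustrative coloring rule; each quartile flag is 'top', 'bottom', or 'middle'."""
    if x_q == hu_q == ai_q == 'top':
        return 'dark red'    # consensus high: top quartile for X, HU, and AI
    if hu_q == ai_q == 'top':
        return 'light red'   # any high: top quartile for HU & AI only
    if x_q == hu_q == ai_q == 'bottom':
        return 'dark blue'   # consensus low: bottom quartile for all
    if hu_q == ai_q == 'bottom':
        return 'light blue'  # any low: bottom quartile for HU & AI only
    return 'grey'            # mixed signals

print(wordcloud_color('top', 'top', 'top'))     # dark red
print(wordcloud_color('middle', 'top', 'top'))  # light red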
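For intuition on PoMA, here is a rough mediation-style illustration of the idea (our framing with made-up numbers, not HAAM's exact estimator or output): the total X-to-judgment association splits into a part carried by the measured text cues (PCs) and a residual direct part, and PoMA is the mediated share.

# Illustrative only: names and numbers are assumptions, not HAAM output
total_effect = 0.50   # total association between X and a judgment (HU or AI)
direct_effect = 0.15  # association remaining after conditioning on the PCs
poma = 1 - direct_effect / total_effect  # share of accuracy flowing through measured cues
print(f"PoMA = {poma:.0%}")              # PoMA = 70%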
Simple Usage
For basic analysis without the full visualization pipeline:
from haam import HAAM
import numpy as np
# Your data
criterion = np.array([...]) # Ground truth
ai_judgment = np.array([...]) # AI predictions
human_judgment = np.array([...]) # Human ratings
texts = [...] # Optional: text data
# Run analysis
haam = HAAM(
    criterion=criterion,
    ai_judgment=ai_judgment,
    human_judgment=human_judgment,
    texts=texts,  # optional
    auto_run=True
)
# View results
print(haam.results['model_summary'])
print(f"Top PCs: {haam.results['top_pcs']}")
# Create main visualization
haam.create_main_visualization()
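Because `texts` is optional, HAAM can in principle be run on the numeric variables alone. A minimal sketch, assuming the `texts` keyword can simply be omitted (text-dependent outputs such as word clouds and UMAP plots would then be unavailable):

haam_no_text = HAAM(
    criterion=criterion,
    ai_judgment=ai_judgment,
    human_judgment=human_judgment,
    auto_run=True  # assumption: texts defaults to None when omitted
)
print(haam_no_text.results['model_summary'])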
Key Methods
create_comprehensive_pc_analysis(): Full analysis with word clouds and UMAP
create_main_visualization(): Interactive dashboard of top PCs
create_3d_umap_with_pc_arrows(): 3D topic space with PC directions
create_pc_wordclouds(): Word clouds for specific PCs
export_all_results(): Save all results to disk
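As a rough sketch of how these might be chained on a fitted object (only create_main_visualization() is shown argument-free in this guide; treating the others as callable with defaults is an assumption):

haam.create_main_visualization()      # interactive dashboard of top PCs
haam.create_3d_umap_with_pc_arrows()  # assumption: runs with default arguments
haam.export_all_results()             # assumption: runs with default arguments; saves results to disk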