import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
import seaborn as sns
from pathlib import Path
from IPython.display import display, HTML
import warnings
warnings.filterwarnings('ignore')

# Set style
plt.style.use('seaborn-v0_8')
sns.set_palette("husl")

# Load results - mask3 (LoF variants) only
results_dir = Path("/Users/sarahurbut/Desktop/SIG/gene_based_analysis")
canonical_dir = results_dir / "canonical"

print("Loading mask3 (LoF variants only) results...")
mask3_files = sorted(canonical_dir.glob("Mask3_*_significant_canonical.tsv"))
mask3_results = []

for file in mask3_files:
    # Parse filename: mask3_{MAF}_significant_canonical.tsv
    parts = file.stem.replace("_significant_canonical", "").split("_")
    maf = parts[1]   # e.g., "0.01" or "singleton"
    
    df = pd.read_csv(file, sep='\t')
    df['MAF'] = maf
    mask3_results.append(df)

# Combine all mask3 results
mask3_df = pd.concat(mask3_results, ignore_index=True)
print(f"Loaded {len(mask3_df)} significant associations from mask3 across {len(mask3_files)} MAF thresholds")

# Get best result per gene-signature (across MAF thresholds)
best_results = mask3_df.loc[mask3_df.groupby(['SIG', 'SYMBOL'])['LOG10P'].idxmax()].copy()
best_results = best_results.sort_values(['SIG', 'LOG10P'], ascending=[True, False])

print("="*80)
print("GENE-BASED ASSOCIATION RESULTS (mask3: LoF Variants Only)")
print("="*80)

Loading mask3 (LoF variants only) results...
Loaded 59 significant associations from mask3 across 6 MAF thresholds
================================================================================
GENE-BASED ASSOCIATION RESULTS (mask3: LoF Variants Only)
================================================================================

# Calculate summary statistics
n_signatures = 21
n_genes_tested = 18464

# Count unique significant genes per signature
sig_counts = best_results.groupby('SIG').agg({
    'SYMBOL': 'count'
}).reset_index()
sig_counts.columns = ['Signature', 'N_Significant']

# Create full summary for all signatures
all_sigs = pd.DataFrame({'Signature': range(n_signatures)})
summary_by_sig = all_sigs.merge(sig_counts, on='Signature', how='left').fillna(0)
summary_by_sig['N_Significant'] = summary_by_sig['N_Significant'].astype(int)
summary_by_sig['N_Genes_Tested'] = n_genes_tested
summary_by_sig['Perc_Significant'] = (summary_by_sig['N_Significant'] / n_genes_tested) * 100

print(f"\nTotal signatures analyzed: {n_signatures}")
print(f"Total genes tested per signature: {n_genes_tested:,}")
print(f"Total significant genes (mask3): {len(best_results)}")
print(f"Signatures with significant genes: {len(sig_counts)}")

print("\n" + "="*80)
print("SUMMARY BY SIGNATURE")
print("="*80)

# Format summary table
display_df = summary_by_sig.copy()
display_df['N_Genes_Tested'] = display_df['N_Genes_Tested'].apply(lambda x: f"{x:,}")
display_df['N_Significant'] = display_df['N_Significant'].apply(lambda x: f"{x:,}")
display_df['Perc_Significant'] = display_df['Perc_Significant'].apply(lambda x: f"{x:.3f}%")
display_df = display_df[['Signature', 'N_Genes_Tested', 'N_Significant', 'Perc_Significant']]
display_df.columns = ['Signature', 'Genes Tested', 'Significant Genes', '% Significant']

display(display_df)

Total signatures analyzed: 21
Total genes tested per signature: 18,464
Total significant genes (mask3): 19
Signatures with significant genes: 12

================================================================================
SUMMARY BY SIGNATURE
================================================================================

# Load results from all masks for comparison
print("Loading results from all masks (mask1-6)...")
all_mask_results = {}

for mask_num in range(1, 7):
    mask_name = f"Mask{mask_num}"
    mask_files = sorted(canonical_dir.glob(f"{mask_name}_*_significant_canonical.tsv"))
    
    if len(mask_files) > 0:
        mask_data = []
        for file in mask_files:
            parts = file.stem.replace("_significant_canonical", "").split("_")
            maf = parts[1]
            df = pd.read_csv(file, sep='\t')
            df['MAF'] = maf
            mask_data.append(df)
        
        if mask_data:
            mask_df = pd.concat(mask_data, ignore_index=True)
            # Get best result per gene-signature (across MAF thresholds)
            best_mask = mask_df.loc[mask_df.groupby(['SIG', 'SYMBOL'])['LOG10P'].idxmax()].copy()
            all_mask_results[mask_name] = best_mask
            print(f"  {mask_name}: {len(best_mask)} significant gene-signature associations")

print(f"\\nTotal masks with results: {len(all_mask_results)}")

# Compare key discoveries across masks
print("\\n" + "="*80)
print("KEY DISCOVERIES ACROSS MASKS")
print("="*80)

# Check DEFB1 in Signature 20 (healthy signature)
print("\\n1. DEFB1 in Signature 20 (healthy signature):")
defb1_found = []
for mask_name, mask_df in all_mask_results.items():
    defb1_sig20 = mask_df[(mask_df['SYMBOL'] == 'DEFB1') & (mask_df['SIG'] == 20)]
    if len(defb1_sig20) > 0:
        defb1_found.append(mask_name)
        log10p = defb1_sig20['LOG10P'].iloc[0]
        print(f"   {mask_name}: LOG10P = {log10p:.2f}")

if len(defb1_found) == 0:
    print("   Not found in other masks")
else:
    print(f"   Found in {len(defb1_found)} mask(s): {', '.join(defb1_found)}")

# Check cross-signature associations (BRCA2, MSH2, PKD1)
print("\\n2. Cross-signature associations:")
cross_sig_genes = {
    'BRCA2': [6, 16],
    'MSH2': [3, 17],
    'PKD1': [8, 16]
}

for gene, sigs in cross_sig_genes.items():
    print(f"\\n   {gene} (Signatures {sigs[0]} and {sigs[1]}):")
    for mask_name, mask_df in all_mask_results.items():
        gene_sigs = mask_df[mask_df['SYMBOL'] == gene]['SIG'].unique()
        found_sigs = [s for s in sigs if s in gene_sigs]
        if len(found_sigs) > 0:
            print(f"     {mask_name}: Found in Signature(s) {found_sigs}")

# Count unique genes per mask
print("\\n" + "="*80)
print("SUMMARY: Unique Genes per Mask")
print("="*80)
for mask_name, mask_df in sorted(all_mask_results.items()):
    unique_genes = mask_df['SYMBOL'].nunique()
    total_assoc = len(mask_df)
    print(f"{mask_name}: {unique_genes} unique genes, {total_assoc} total associations")

# Find genes that appear in multiple masks (robust discoveries)
print("\\n" + "="*80)
print("ROBUST DISCOVERIES: Genes Found in Multiple Masks")
print("="*80)

all_genes = set()
for mask_df in all_mask_results.values():
    all_genes.update(mask_df['SYMBOL'].unique())

gene_mask_count = {}
for gene in all_genes:
    count = sum(1 for mask_df in all_mask_results.values() if gene in mask_df['SYMBOL'].values)
    if count > 1:
        gene_mask_count[gene] = count

if gene_mask_count:
    robust_genes = sorted(gene_mask_count.items(), key=lambda x: x[1], reverse=True)
    print(f"\\nGenes found in 2+ masks: {len(robust_genes)}")
    for gene, count in robust_genes[:10]:  # Show top 10
        masks_with_gene = [m for m, df in all_mask_results.items() if gene in df['SYMBOL'].values]
        print(f"  {gene}: Found in {count} mask(s) - {', '.join(masks_with_gene)}")
else:
    print("\\nNo genes found in multiple masks")

Loading results from all masks (mask1-6)...
  Mask1: 16 significant gene-signature associations
  Mask2: 15 significant gene-signature associations
  Mask3: 19 significant gene-signature associations
  Mask4: 15 significant gene-signature associations
  Mask5: 19 significant gene-signature associations
  Mask6: 16 significant gene-signature associations
\nTotal masks with results: 6
\n================================================================================
KEY DISCOVERIES ACROSS MASKS
================================================================================
\n1. DEFB1 in Signature 20 (healthy signature):
   Mask1: LOG10P = 6.05
   Mask2: LOG10P = 6.05
   Mask3: LOG10P = 6.05
   Mask4: LOG10P = 6.05
   Mask5: LOG10P = 6.05
   Found in 5 mask(s): Mask1, Mask2, Mask3, Mask4, Mask5
\n2. Cross-signature associations:
\n   BRCA2 (Signatures 6 and 16):
     Mask1: Found in Signature(s) [6, 16]
     Mask2: Found in Signature(s) [6, 16]
     Mask3: Found in Signature(s) [6, 16]
     Mask4: Found in Signature(s) [6, 16]
     Mask5: Found in Signature(s) [6]
\n   MSH2 (Signatures 3 and 17):
     Mask1: Found in Signature(s) [3, 17]
\n   PKD1 (Signatures 8 and 16):
     Mask1: Found in Signature(s) [8, 16]
     Mask2: Found in Signature(s) [8, 16]
     Mask3: Found in Signature(s) [16]
     Mask4: Found in Signature(s) [16]
     Mask5: Found in Signature(s) [16]
     Mask6: Found in Signature(s) [16]
\n================================================================================
SUMMARY: Unique Genes per Mask
================================================================================
Mask1: 13 unique genes, 16 total associations
Mask2: 13 unique genes, 15 total associations
Mask3: 18 unique genes, 19 total associations
Mask4: 14 unique genes, 15 total associations
Mask5: 19 unique genes, 19 total associations
Mask6: 16 unique genes, 16 total associations
\n================================================================================
ROBUST DISCOVERIES: Genes Found in Multiple Masks
================================================================================
\nGenes found in 2+ masks: 16
  LPA: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  LDLR: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  CDH26: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  TTN: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  RAD52: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  PKD1: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  APOB: Found in 6 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5, Mask6
  C10orf67: Found in 5 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5
  TET2: Found in 5 mask(s) - Mask1, Mask2, Mask3, Mask4, Mask5
  MIP: Found in 5 mask(s) - Mask2, Mask3, Mask4, Mask5, Mask6

# Visualize cross-mask comparison
fig, axes = plt.subplots(2, 2, figsize=(16, 12))

# Plot 1: Number of significant genes per mask
ax1 = axes[0, 0]
mask_counts = {mask: df['SYMBOL'].nunique() for mask, df in sorted(all_mask_results.items())}
bars = ax1.bar(mask_counts.keys(), mask_counts.values(), color=plt.cm.viridis(np.linspace(0, 1, len(mask_counts))))
ax1.set_xlabel('Mask', fontsize=12)
ax1.set_ylabel('Number of Unique Significant Genes', fontsize=12)
ax1.set_title('Significant Genes per Mask', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3, axis='y')
for bar in bars:
    height = bar.get_height()
    ax1.text(bar.get_x() + bar.get_width()/2., height,
            f'{int(height)}', ha='center', va='bottom', fontsize=10, fontweight='bold')

# Plot 2: Robust discoveries - genes found in multiple masks
ax2 = axes[0, 1]
if gene_mask_count:
    robust_sorted = sorted(gene_mask_count.items(), key=lambda x: x[1], reverse=True)[:15]
    genes_list = [g for g, _ in robust_sorted]
    counts_list = [c for _, c in robust_sorted]
    bars = ax2.barh(genes_list, counts_list, color=plt.cm.plasma(np.linspace(0, 1, len(genes_list))))
    ax2.set_xlabel('Number of Masks', fontsize=12)
    ax2.set_ylabel('Gene', fontsize=12)
    ax2.set_title('Robust Discoveries: Genes Found in Multiple Masks', fontsize=14, fontweight='bold')
    ax2.grid(True, alpha=0.3, axis='x')
    for i, (bar, count) in enumerate(zip(bars, counts_list)):
        ax2.text(count, bar.get_y() + bar.get_height()/2.,
                f'{count}', ha='left', va='center', fontsize=9, fontweight='bold')
else:
    ax2.text(0.5, 0.5, 'No genes found in multiple masks', 
            ha='center', va='center', transform=ax2.transAxes, fontsize=12)
    ax2.set_title('Robust Discoveries', fontsize=14, fontweight='bold')

# Plot 3: Key discoveries across masks - DEFB1 in Sig 20
ax3 = axes[1, 0]
defb1_results = []
for mask_name, mask_df in sorted(all_mask_results.items()):
    defb1_sig20 = mask_df[(mask_df['SYMBOL'] == 'DEFB1') & (mask_df['SIG'] == 20)]
    if len(defb1_sig20) > 0:
        defb1_results.append((mask_name, defb1_sig20['LOG10P'].iloc[0]))
    else:
        defb1_results.append((mask_name, 0))

if any(log10p > 0 for _, log10p in defb1_results):
    masks, log10ps = zip(*defb1_results)
    colors_defb1 = ['green' if p > 0 else 'lightgray' for p in log10ps]
    bars = ax3.bar(masks, log10ps, color=colors_defb1, alpha=0.7, edgecolor='black')
    ax3.axhline(y=5.6, color='r', linestyle='--', alpha=0.5, label='Genome-wide significance')
    ax3.set_xlabel('Mask', fontsize=12)
    ax3.set_ylabel('-log₁₀(P-value)', fontsize=12)
    ax3.set_title('DEFB1 in Signature 20 (Healthy Signature) Across Masks', fontsize=14, fontweight='bold')
    ax3.legend()
    ax3.grid(True, alpha=0.3, axis='y')
    for bar, log10p in zip(bars, log10ps):
        if log10p > 0:
            ax3.text(bar.get_x() + bar.get_width()/2., log10p,
                    f'{log10p:.1f}', ha='center', va='bottom', fontsize=9)
else:
    ax3.text(0.5, 0.5, 'DEFB1 not found in other masks', 
            ha='center', va='center', transform=ax3.transAxes, fontsize=12)
    ax3.set_title('DEFB1 in Signature 20 Across Masks', fontsize=14, fontweight='bold')

# Plot 4: Cross-signature associations across masks
ax4 = axes[1, 1]
cross_sig_data = []
for gene, sigs in cross_sig_genes.items():
    for mask_name, mask_df in sorted(all_mask_results.items()):
        gene_sigs = mask_df[mask_df['SYMBOL'] == gene]['SIG'].unique()
        found_both = all(s in gene_sigs for s in sigs)
        if found_both:
            cross_sig_data.append((gene, mask_name, 'Both'))
        elif any(s in gene_sigs for s in sigs):
            found_sigs = [s for s in sigs if s in gene_sigs]
            cross_sig_data.append((gene, mask_name, f'Sig{found_sigs[0]}'))

if cross_sig_data:
    # Create a summary
    gene_mask_summary = {}
    for gene, mask, status in cross_sig_data:
        key = f"{gene}"
        if key not in gene_mask_summary:
            gene_mask_summary[key] = []
        gene_mask_summary[key].append(mask)
    
    genes_list = list(gene_mask_summary.keys())
    mask_counts_per_gene = [len(masks) for masks in gene_mask_summary.values()]
    
    bars = ax4.barh(genes_list, mask_counts_per_gene, color=plt.cm.coolwarm(np.linspace(0, 1, len(genes_list))))
    ax4.set_xlabel('Number of Masks', fontsize=12)
    ax4.set_ylabel('Gene', fontsize=12)
    ax4.set_title('Cross-Signature Associations Across Masks', fontsize=14, fontweight='bold')
    ax4.grid(True, alpha=0.3, axis='x')
    for bar, count in zip(bars, mask_counts_per_gene):
        ax4.text(count, bar.get_y() + bar.get_height()/2.,
                f'{count}', ha='left', va='center', fontsize=9, fontweight='bold')
else:
    ax4.text(0.5, 0.5, 'No cross-signature associations found', 
            ha='center', va='center', transform=ax4.transAxes, fontsize=12)
    ax4.set_title('Cross-Signature Associations', fontsize=14, fontweight='bold')

plt.tight_layout()
plt.show()

print(f"\\n{'='*80}")
print("SUMMARY: Cross-Mask Robustness")
print(f"{'='*80}")
print(f"Total masks analyzed: {len(all_mask_results)}")
print(f"Genes found in 2+ masks: {len(gene_mask_count) if gene_mask_count else 0}")
if gene_mask_count:
    most_robust = max(gene_mask_count.items(), key=lambda x: x[1])
    print(f"Most robust discovery: {most_robust[0]} (found in {most_robust[1]} masks)")

\n================================================================================
SUMMARY: Cross-Mask Robustness
================================================================================
Total masks analyzed: 6
Genes found in 2+ masks: 16
Most robust discovery: LPA (found in 6 masks)

# Load disease names and cluster assignments
import torch

# Load disease names
disease_names_path = Path("/Users/sarahurbut/aladynoulli2/pyScripts/csv/disease_names.csv")
if disease_names_path.exists():
    disease_names_df = pd.read_csv(disease_names_path)
    disease_names = disease_names_df['x'].astype(str).tolist()
    print(f"Loaded {len(disease_names)} disease names")
else:
    disease_names = None
    print("Warning: disease_names.csv not found")

# Load cluster assignments
clusters_path = Path.home() / "Library" / "CloudStorage" / "Dropbox-Personal" / "data_for_running" / "initial_clusters_400k.pt"

if clusters_path.exists() and disease_names is not None:
    clusters = torch.load(clusters_path, map_location='cpu', weights_only=False)
    if torch.is_tensor(clusters):
        clusters = clusters.detach().cpu().numpy()
    
    print(f"\\nLoaded cluster assignments: shape {clusters.shape}")
    print(f"Cluster assignments map diseases to signatures (clusters)\\n")
    
    # Show diseases for signatures with significant genes
    print("="*80)
    print("DISEASE COMPOSITION BY SIGNATURE (Signatures with Significant Genes)")
    print("="*80)
    
    for sig_num in sorted(best_results['SIG'].unique()):
        sig_genes = best_results[best_results['SIG'] == sig_num]
        sig_disease_indices = np.where(clusters == sig_num)[0]
        
        print(f"\\n{'='*80}")
        print(f"Signature {sig_num}: {len(sig_disease_indices)} diseases")
        print(f"Significant genes: {', '.join(sig_genes['SYMBOL'].tolist())}")
        print(f"{'='*80}")
        
        if len(sig_disease_indices) > 0:
            sig_disease_names = [disease_names[i] for i in sig_disease_indices]
            print(f"\\nDiseases in this signature (showing up to 20):")
            for i, disease_name in enumerate(sig_disease_names[:20]):
                print(f"  {i+1}. {disease_name}")
            if len(sig_disease_names) > 20:
                print(f"  ... and {len(sig_disease_names) - 20} more diseases")
else:
    if not clusters_path.exists():
        print(f"\\nWarning: Cluster file not found at {clusters_path}")
    if disease_names is None:
        print("\\nWarning: Cannot show disease composition without disease names")

Loaded 348 disease names
\nLoaded cluster assignments: shape (348,)
Cluster assignments map diseases to signatures (clusters)\n
================================================================================
DISEASE COMPOSITION BY SIGNATURE (Signatures with Significant Genes)
================================================================================
\n================================================================================
Signature 0: 16 diseases
Significant genes: TTN, EEF1A1
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Rheumatic disease of the heart valves
  2. Mitral valve disease
  3. Aortic valve disease
  4. Disease of tricuspid valve
  5. Other forms of chronic heart disease
  6. Cardiomegaly
  7. Pericarditis
  8. Primary/intrinsic cardiomyopathies
  9. Left bundle branch block
  10. Paroxysmal supraventricular tachycardia
  11. Paroxysmal ventricular tachycardia
  12. Atrial fibrillation and flutter
  13. Congestive heart failure (CHF) NOS
  14. Heart failure NOS
  15. Pleurisy; pleural effusion
  16. Congenital anomalies of great vessels
\n================================================================================
Signature 3: 82 diseases
Significant genes: ADGRG7
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Bacterial enteritis
  2. Viral Enteritis
  3. Viral infection
  4. Malignant neoplasm of kidney, except pelvis
  5. Neoplasm of uncertain behavior
  6. Non-Hodgkins lymphoma
  7. Benign neoplasm of lip, oral cavity, and pharynx
  8. Lipoma
  9. Lipoma of skin and subcutaneous tissue
  10. Other benign neoplasm of connective and other soft tissue
  11. Hemangioma and lymphangioma, any site
  12. Gout
  13. Bipolar
  14. Alcohol-related disorders
  15. Alcoholic liver damage
  16. Sleep disorders
  17. Parkinson's disease
  18. Multiple sclerosis
  19. Epilepsy, recurrent seizures, convulsions
  20. Facial nerve disorders [CN7]
  ... and 62 more diseases
\n================================================================================
Signature 4: 5 diseases
Significant genes: C10orf67
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Septal Deviations/Turbinate Hypertrophy
  2. Nasal polyps
  3. Chronic pharyngitis and nasopharyngitis
  4. Chronic sinusitis
  5. Other upper respiratory disease
\n================================================================================
Signature 5: 7 diseases
Significant genes: LDLR, APOB, LPA, CDH26
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Hypercholesterolemia
  2. Unstable angina (intermediate coronary syndrome)
  3. Myocardial infarction
  4. Angina pectoris
  5. Coronary atherosclerosis
  6. Other chronic ischemic heart disease, unspecified
  7. Other acute and subacute forms of ischemic heart disease
\n================================================================================
Signature 6: 8 diseases
Significant genes: BRCA2
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Cancer of bronchus; lung
  2. Malignant neoplasm, other
  3. Secondary malignant neoplasm
  4. Secondary malignancy of lymph nodes
  5. Secondary malignancy of respiratory organs
  6. Secondary malignant neoplasm of digestive systems
  7. Secondary malignant neoplasm of liver
  8. Secondary malignancy of bone
\n================================================================================
Signature 7: 22 diseases
Significant genes: GNB2
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Hypothyroidism NOS
  2. Hyperlipidemia
  3. Obesity
  4. Major depressive disorder
  5. Anxiety disorder
  6. Sleep apnea
  7. Migraine
  8. Essential hypertension
  9. Asthma
  10. GERD
  11. Irritable Bowel Syndrome
  12. Other chronic nonalcoholic liver disease
  13. Other inflammatory spondylopathies
  14. Spondylosis and allied disorders
  15. Other disorders of soft tissues
  16. Rheumatism, unspecified and fibrositis
  17. Other disorders of bone and cartilage
  18. Osteoporosis NOS
  19. Cervicalgia
  20. Myalgia and myositis unspecified
  ... and 2 more diseases
\n================================================================================
Signature 8: 28 diseases
Significant genes: RNF216
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Cervical intraepithelial neoplasia [CIN] [Cervical dysplasia]
  2. Malignant neoplasm of uterus
  3. Malignant neoplasm of ovary
  4. Uterine leiomyoma
  5. Benign neoplasm of ovary
  6. Pelvic peritoneal adhesions, female (postoperative) (postinfection)
  7. Cervicitis and endocervicitis
  8. Endometriosis
  9. Prolapse of vaginal walls
  10. Uterine/Uterovaginal prolapse
  11. Disorders of uterus, NEC
  12. Noninflammatory disorders of cervix
  13. Noninflammatory disorders of vagina
  14. Endometrial hyperplasia
  15. Polyp of corpus uteri
  16. Mucous polyp of cervix
  17. Hypertrophy of female genital organs
  18. Pain and other symptoms associated with female genital organs
  19. Dyspareunia
  20. Disorders of menstruation and other abnormal bleeding from female genital tract
  ... and 8 more diseases
\n================================================================================
Signature 10: 11 diseases
Significant genes: MIP
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Retinal detachments and defects
  2. Retinal detachment with retinal defect
  3. Other retinal disorders
  4. Macular degeneration (senile) of retina NOS
  5. Retinal vascular changes and abnomalities
  6. Glaucoma
  7. Primary open angle glaucoma
  8. Cataract
  9. Senile cataract
  10. Myopia
  11. Disorders of vitreous body
\n================================================================================
Signature 11: 8 diseases
Significant genes: CLPTM1L
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Hemiplegia
  2. Cerebrovascular disease
  3. Occlusion and stenosis of precerebral arteries
  4. Occlusion of cerebral arteries
  5. Cerebral artery occlusion, with cerebral infarction
  6. Cerebral ischemia
  7. Transient cerebral ischemia
  8. Late effects of cerebrovascular disease
\n================================================================================
Signature 16: 29 diseases
Significant genes: PKD1, TET2, BRCA2
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Gram negative septicemia
  2. Bacterial infection NOS
  3. Staphylococcus infections
  4. Streptococcus infection
  5. E. coli
  6. Candidiasis
  7. Disorders of calcium/phosphorus metabolism
  8. Hyposmolality and/or hyponatremia
  9. Hyperpotassemia
  10. Hypopotassemia
  11. Acidosis
  12. Hypovolemia
  13. Other anemias
  14. Thrombocytopenia
  15. Neutropenia
  16. Orthostatic hypotension
  17. Hypotension NOS
  18. Bacterial pneumonia
  19. Pulmonary collapse; interstitial and compensatory emphysema
  20. Other diseases of respiratory system, NEC
  ... and 9 more diseases
\n================================================================================
Signature 19: 23 diseases
Significant genes: OCA2, RAD52
================================================================================
\nDiseases in this signature (showing up to 20):
  1. Viral warts & HPV
  2. Melanomas of skin
  3. Other non-epithelial cancer of skin
  4. Breast cancer [female]
  5. Malignant neoplasm of female breast
  6. Benign neoplasm of skin
  7. Thyrotoxicosis with or without goiter
  8. Secondary hypothyroidism
  9. Other disorders of eyelids
  10. Ectropion or entropion
  11. Disorders of lacrimal system
  12. Epiphora
  13. Hypertensive chronic kidney disease
  14. Chronic glomerulonephritis, NOS
  15. Renal failure NOS
  16. Chronic renal failure [CKD]
  17. Chronic Kidney Disease, Stage III
  18. Lump or mass in breast
  19. Disorder of skin and subcutaneous tissue NOS
  20. Other hypertrophic and atrophic conditions of skin
  ... and 3 more diseases
\n================================================================================
Signature 20: 0 diseases
Significant genes: DEFB1
================================================================================

# Create visualizations
fig, axes = plt.subplots(2, 1, figsize=(14, 10))

# Plot 1: Manhattan-style plot - significance by signature
ax1 = axes[0]
colors = plt.cm.Set3(np.linspace(0, 1, 21))

# Group by signature and plot
for sig_num in sorted(best_results['SIG'].unique()):
    sig_data = best_results[best_results['SIG'] == sig_num]
    x_pos = sig_num + np.random.normal(0, 0.1, len(sig_data))  # Jitter for visibility
    ax1.scatter(x_pos, sig_data['LOG10P'], 
                alpha=0.7, s=100, color=colors[sig_num], 
                label=f'Sig {sig_num}' if len(sig_data) > 0 else '')
    
    # Label top genes
    top_gene = sig_data.loc[sig_data['LOG10P'].idxmax()]
    if top_gene['LOG10P'] > 5:
        ax1.annotate(top_gene['SYMBOL'], 
                    xy=(sig_num, top_gene['LOG10P']),
                    xytext=(5, 5), textcoords='offset points',
                    fontsize=9, alpha=0.8)

ax1.axhline(y=5.6, color='r', linestyle='--', alpha=0.5, label='Genome-wide significance (p=2.5×10⁻⁶)')
ax1.set_xlabel('Signature', fontsize=12)
ax1.set_ylabel('-log₁₀(P-value)', fontsize=12)
ax1.set_title('Significant Genes by Signature (mask3: LoF Variants)', fontsize=14, fontweight='bold')
ax1.grid(True, alpha=0.3)
ax1.legend(bbox_to_anchor=(1.05, 1), loc='upper left', fontsize=8, ncol=2)
ax1.set_xticks(range(21))

# Plot 2: Bar plot - number of significant genes per signature
ax2 = axes[1]
sig_counts_plot = best_results.groupby('SIG').size().sort_index()
bars = ax2.bar(sig_counts_plot.index, sig_counts_plot.values, 
               color=colors[sig_counts_plot.index], alpha=0.7, edgecolor='black')

# Add value labels on bars
for bar in bars:
    height = bar.get_height()
    if height > 0:
        ax2.text(bar.get_x() + bar.get_width()/2., height,
                f'{int(height)}',
                ha='center', va='bottom', fontsize=10, fontweight='bold')

ax2.set_xlabel('Signature', fontsize=12)
ax2.set_ylabel('Number of Significant Genes', fontsize=12)
ax2.set_title('Number of Significant Genes per Signature', fontsize=14, fontweight='bold')
ax2.set_xticks(range(21))
ax2.grid(True, alpha=0.3, axis='y')

plt.tight_layout()
plt.show()

# Print summary
print(f"\nTotal signatures with significant genes: {len(sig_counts_plot)}")
print(f"Total significant gene-signature associations: {len(best_results)}")
print(f"\nTop signatures by number of significant genes:")
top_sigs = sig_counts_plot.sort_values(ascending=False).head(20)
for sig, count in top_sigs.items():
    genes = best_results[best_results['SIG'] == sig]['SYMBOL'].tolist()
    print(f"  Signature {sig}: {count} gene(s) - {', '.join(genes)}")

Total signatures with significant genes: 12
Total significant gene-signature associations: 19

Top signatures by number of significant genes:
  Signature 5: 4 gene(s) - LDLR, APOB, LPA, CDH26
  Signature 16: 3 gene(s) - PKD1, TET2, BRCA2
  Signature 0: 2 gene(s) - TTN, EEF1A1
  Signature 19: 2 gene(s) - OCA2, RAD52
  Signature 3: 1 gene(s) - ADGRG7
  Signature 4: 1 gene(s) - C10orf67
  Signature 6: 1 gene(s) - BRCA2
  Signature 7: 1 gene(s) - GNB2
  Signature 8: 1 gene(s) - RNF216
  Signature 10: 1 gene(s) - MIP
  Signature 11: 1 gene(s) - CLPTM1L
  Signature 20: 1 gene(s) - DEFB1

# Display all significant genes
print("="*80)
print("SIGNIFICANT GENES (Best MAF Threshold per Gene-Signature)")
print("="*80)

display_cols = ['SIG', 'SYMBOL', 'MAF', 'LOG10P', 'BETA', 'SE', 'CHROM']
display_results = best_results[display_cols].copy()
display_results['LOG10P'] = display_results['LOG10P'].apply(lambda x: f"{x:.2f}")
display_results['BETA'] = display_results['BETA'].apply(lambda x: f"{x:.4f}")
display_results['SE'] = display_results['SE'].apply(lambda x: f"{x:.4f}")
display_results.columns = ['Signature', 'Gene', 'MAF', 'LOG10P', 'Beta', 'SE', 'Chr']

display(display_results)

================================================================================
SIGNIFICANT GENES (Best MAF Threshold per Gene-Signature)
================================================================================

# After loading all mask results (around line 467-491)
# Add this code to export each mask to CSV:

output_dir = Path("/Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables")
output_dir.mkdir(parents=True, exist_ok=True)

print("\n" + "="*80)
print("EXPORTING RESULTS TO CSV FILES")
print("="*80)

for mask_name, mask_df in sorted(all_mask_results.items()):
    # Get best result per gene-signature (across MAF thresholds)
    best_mask = mask_df.loc[mask_df.groupby(['SIG', 'SYMBOL'])['LOG10P'].idxmax()].copy()
    
    # Sort by signature, then by LOG10P (descending)
    best_mask = best_mask.sort_values(['SIG', 'LOG10P'], ascending=[True, False])
    
    # Select columns for publication table
    export_cols = ['SIG', 'SYMBOL', 'MAF', 'LOG10P', 'BETA', 'SE', 'CHROM']
    export_df = best_mask[export_cols].copy()
    
    # Rename columns for clarity
    export_df.columns = ['Signature', 'Gene', 'MAF', 'LOG10P', 'Beta', 'SE', 'Chr']
    
    # Format numeric columns for readability
    export_df['LOG10P'] = export_df['LOG10P'].round(2)
    export_df['Beta'] = export_df['Beta'].round(4)
    export_df['SE'] = export_df['SE'].round(4)
    
    # Save to CSV
    csv_path = output_dir / f"{mask_name}_significant_genes.csv"
    export_df.to_csv(csv_path, index=False)
    
    print(f"Exported {mask_name}: {len(export_df)} gene-signature associations -> {csv_path}")

print(f"\nAll CSV files saved to: {output_dir}")

================================================================================
EXPORTING RESULTS TO CSV FILES
================================================================================
Exported Mask1: 16 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask1_significant_genes.csv
Exported Mask2: 15 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask2_significant_genes.csv
Exported Mask3: 19 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask3_significant_genes.csv
Exported Mask4: 15 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask4_significant_genes.csv
Exported Mask5: 19 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask5_significant_genes.csv
Exported Mask6: 16 gene-signature associations -> /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables/Mask6_significant_genes.csv

All CSV files saved to: /Users/sarahurbut/Desktop/SIG/gene_based_analysis/csv_tables

import pandas as pd
from pathlib import Path

# Input directory
csv_dir = Path("/Users/sarahurbut/Library/CloudStorage/Dropbox-Personal/Apps/Overleaf/Aladynoulli_Nature/SuppDataFiles/rvas/significant/gene_based_csv")

# Output file
output_file = Path("/Users/sarahurbut/Library/CloudStorage/Dropbox-Personal/Apps/Overleaf/Aladynoulli_Nature/SuppDataFiles/rvas/rvas_tables_latex.tex")

# Mask descriptions for captions
mask_descriptions = {
    "Mask1": "LoF variants only",
    "Mask2": "LoF and DelMis09 variants",
    "Mask3": "LoF, DelMis09, DelMis08 variants",
    "Mask4": "LoF, DelMis09, DelMis08, DelMis07 variants",
    "Mask5": "LoF, DelMis09, DelMis08, DelMis07, DelMis06 variants",
    "Mask6": "LoF, DelMis09, DelMis08, DelMis07, DelMis06, DelMis05 variants (most inclusive)"
}

def format_maf(maf_str):
    """Format MAF value for LaTeX"""
    if maf_str == "singleton":
        return "singleton"
    try:
        maf_val = float(maf_str)
        if maf_val < 0.0001:
            # Use scientific notation for very small values
            return f"${maf_val:.0e}$".replace("e-0", " \\times 10^{-").replace("e-", " \\times 10^{-") + "}"
        else:
            return str(maf_val)
    except:
        return str(maf_str)

def csv_to_latex_table(csv_path, mask_name, table_num):
    """Convert CSV to LaTeX table"""
    df = pd.read_csv(csv_path)
    
    # Format columns
    df['LOG10P'] = df['LOG10P'].round(2)
    df['Beta'] = df['Beta'].round(4)
    df['SE'] = df['SE'].round(4)
    df['MAF'] = df['MAF'].apply(format_maf)
    
    # Generate LaTeX
    latex = []
    latex.append("\\begin{table}[h]")
    latex.append("\\centering")
    latex.append("\\footnotesize")
    latex.append("\\singlespacing")
    latex.append(f"\\caption{{\\textbf{{Significant rare variant associations: {mask_name} ({mask_descriptions.get(mask_name, '')}).}} Gene-based rare variant association results for {mask_name}, showing the best MAF threshold per gene-signature pair. LOG10P = $-\\log_{{10}}$(p-value). Genome-wide significance threshold: p $<$ $2.5 \\times 10^{{-6}}$ (LOG10P $>$ 5.6).}}")
    latex.append(f"\\label{{tab:rvas_{mask_name.lower()}}}")
    latex.append("\\begin{tabular}{ccccccc}")
    latex.append("\\toprule")
    latex.append("\\textbf{Sig} & \\textbf{Gene} & \\textbf{MAF} & \\textbf{LOG10P} & \\textbf{Beta} & \\textbf{SE} & \\textbf{Chr} \\\\")
    latex.append("\\midrule")
    
    # Add rows
    for _, row in df.iterrows():
        sig = int(row['Signature'])
        gene = row['Gene']
        maf = row['MAF']
        log10p = row['LOG10P']
        beta = row['Beta']
        se = row['SE']
        chr_num = int(row['Chr'])
        
        latex.append(f"{sig} & {gene} & {maf} & {log10p:.2f} & {beta:.4f} & {se:.4f} & {chr_num} \\\\")
    
    latex.append("\\bottomrule")
    latex.append("\\end{tabular}")
    latex.append("\\end{table}")
    latex.append("")  # Empty line between tables
    
    return "\n".join(latex)

# Process all mask files
all_tables = []

for mask_num in range(1, 7):
    mask_name = f"Mask{mask_num}"
    csv_path = csv_dir / f"{mask_name}_significant_genes.csv"
    
    if csv_path.exists():
        print(f"Processing {mask_name}...")
        table_latex = csv_to_latex_table(csv_path, mask_name, mask_num)
        all_tables.append(table_latex)
    else:
        print(f"Warning: {csv_path} not found")

# Write all tables to file
with open(output_file, 'w') as f:
    f.write("% RVAS Significant Genes Tables - Generated from CSV files\n")
    f.write("% Place these tables in Extended Data Tables section\n\n")
    f.write("\n".join(all_tables))

print(f"\nGenerated LaTeX tables for {len(all_tables)} masks")
print(f"Output saved to: {output_file}")

Processing Mask1...
Processing Mask2...
Processing Mask3...
Processing Mask4...
Processing Mask5...
Processing Mask6...

Generated LaTeX tables for 6 masks
Output saved to: /Users/sarahurbut/Library/CloudStorage/Dropbox-Personal/Apps/Overleaf/Aladynoulli_Nature/SuppDataFiles/rvas/rvas_tables_latex.tex

# Show genes by signature
for sig_num in sorted(best_results['SIG'].unique()):
    sig_genes = best_results[best_results['SIG'] == sig_num].sort_values('LOG10P', ascending=False)
    
    print("\n" + "="*80)
    print(f"Signature {sig_num}: {len(sig_genes)} significant gene(s)")
    print("="*80)
    
    for _, row in sig_genes.iterrows():
        pval = 10**(-row['LOG10P'])
        print(f"\n  {row['SYMBOL']}:")
        print(f"    P-value: {pval:.2e} (LOG10P = {row['LOG10P']:.2f})")
        print(f"    MAF threshold: {row['MAF']}")
        print(f"    Effect size (Beta): {row['BETA']:.4f} ± {row['SE']:.4f}")
        print(f"    Chromosome: {row['CHROM']}")

================================================================================
Signature 0: 2 significant gene(s)
================================================================================

  TTN:
    P-value: 1.06e-21 (LOG10P = 20.97)
    MAF threshold: 0.0001
    Effect size (Beta): 0.1896 ± 0.0198
    Chromosome: 2

  EEF1A1:
    P-value: 1.28e-06 (LOG10P = 5.89)
    MAF threshold: singleton
    Effect size (Beta): -2.1361 ± 0.4410
    Chromosome: 6

================================================================================
Signature 3: 1 significant gene(s)
================================================================================

  ADGRG7:
    P-value: 2.23e-06 (LOG10P = 5.65)
    MAF threshold: 0.001
    Effect size (Beta): -0.1780 ± 0.0376
    Chromosome: 3

================================================================================
Signature 4: 1 significant gene(s)
================================================================================

  C10orf67:
    P-value: 1.16e-06 (LOG10P = 5.93)
    MAF threshold: singleton
    Effect size (Beta): 3.4043 ± 0.7002
    Chromosome: 10

================================================================================
Signature 5: 4 significant gene(s)
================================================================================

  LDLR:
    P-value: 1.04e-33 (LOG10P = 32.98)
    MAF threshold: 0.001
    Effect size (Beta): 0.4656 ± 0.0385
    Chromosome: 19

  APOB:
    P-value: 5.28e-09 (LOG10P = 8.28)
    MAF threshold: 0.0001
    Effect size (Beta): -0.2520 ± 0.0432
    Chromosome: 2

  LPA:
    P-value: 7.18e-08 (LOG10P = 7.14)
    MAF threshold: 0.1
    Effect size (Beta): -0.0325 ± 0.0060
    Chromosome: 6

  CDH26:
    P-value: 7.73e-07 (LOG10P = 6.11)
    MAF threshold: 0.001
    Effect size (Beta): -0.1283 ± 0.0260
    Chromosome: 20

================================================================================
Signature 6: 1 significant gene(s)
================================================================================

  BRCA2:
    P-value: 7.16e-10 (LOG10P = 9.14)
    MAF threshold: 0.0001
    Effect size (Beta): 0.1920 ± 0.0312
    Chromosome: 13

================================================================================
Signature 7: 1 significant gene(s)
================================================================================

  GNB2:
    P-value: 1.44e-06 (LOG10P = 5.84)
    MAF threshold: 0.0001
    Effect size (Beta): 0.8585 ± 0.1781
    Chromosome: 7

================================================================================
Signature 8: 1 significant gene(s)
================================================================================

  RNF216:
    P-value: 5.23e-07 (LOG10P = 6.28)
    MAF threshold: 0.0001
    Effect size (Beta): -0.3578 ± 0.0713
    Chromosome: 7

================================================================================
Signature 10: 1 significant gene(s)
================================================================================

  MIP:
    P-value: 5.05e-07 (LOG10P = 6.30)
    MAF threshold: 0.0001
    Effect size (Beta): 0.3502 ± 0.0697
    Chromosome: 12

================================================================================
Signature 11: 1 significant gene(s)
================================================================================

  CLPTM1L:
    P-value: 5.87e-07 (LOG10P = 6.23)
    MAF threshold: 0.0001
    Effect size (Beta): 0.4448 ± 0.0890
    Chromosome: 5

================================================================================
Signature 16: 3 significant gene(s)
================================================================================

  PKD1:
    P-value: 2.34e-12 (LOG10P = 11.63)
    MAF threshold: 1e-05
    Effect size (Beta): 0.6068 ± 0.0865
    Chromosome: 16

  TET2:
    P-value: 1.88e-07 (LOG10P = 6.73)
    MAF threshold: singleton
    Effect size (Beta): 0.4271 ± 0.0820
    Chromosome: 4

  BRCA2:
    P-value: 2.39e-06 (LOG10P = 5.62)
    MAF threshold: 0.0001
    Effect size (Beta): 0.1481 ± 0.0314
    Chromosome: 13

================================================================================
Signature 19: 2 significant gene(s)
================================================================================

  OCA2:
    P-value: 3.58e-07 (LOG10P = 6.45)
    MAF threshold: 0.01
    Effect size (Beta): 0.1232 ± 0.0242
    Chromosome: 15

  RAD52:
    P-value: 1.01e-06 (LOG10P = 5.99)
    MAF threshold: 0.1
    Effect size (Beta): -0.0378 ± 0.0077
    Chromosome: 12

================================================================================
Signature 20: 1 significant gene(s)
================================================================================

  DEFB1:
    P-value: 8.96e-07 (LOG10P = 6.05)
    MAF threshold: 0.0001
    Effect size (Beta): 0.7003 ± 0.1425
    Chromosome: 8

R1: Genetic Validation - Gene-Based Rare Variant Association Studies (RVAS)¶

Reviewer Question¶

Why This Matters¶

Our Approach¶

Key Findings¶

1. Summary Statistics¶

4. Disease Composition by Signature¶

4. Cross-Mask Comparison¶

5. Visualization¶

2. Significant Genes¶

3. Significant Genes by Signature¶

5. Methods¶

Gene-Based Association Testing¶

References¶

	Signature	Genes Tested	Significant Genes	% Significant
0	0	18,464	2	0.011%
1	1	18,464	0	0.000%
2	2	18,464	0	0.000%
3	3	18,464	1	0.005%
4	4	18,464	1	0.005%
5	5	18,464	4	0.022%
6	6	18,464	1	0.005%
7	7	18,464	1	0.005%
8	8	18,464	1	0.005%
9	9	18,464	0	0.000%
10	10	18,464	1	0.005%
11	11	18,464	1	0.005%
12	12	18,464	0	0.000%
13	13	18,464	0	0.000%
14	14	18,464	0	0.000%
15	15	18,464	0	0.000%
16	16	18,464	3	0.016%
17	17	18,464	0	0.000%
18	18	18,464	0	0.000%
19	19	18,464	2	0.011%
20	20	18,464	1	0.005%

	Signature	Gene	MAF	LOG10P	Beta	SE	Chr
0	0	TTN	0.0001	20.97	0.1896	0.0198	2
54	0	EEF1A1	singleton	5.89	-2.1361	0.4410	6
13	3	ADGRG7	0.001	5.65	-0.1780	0.0376	3
55	4	C10orf67	singleton	5.93	3.4043	0.7002	10
15	5	LDLR	0.001	32.98	0.4656	0.0385	19
1	5	APOB	0.0001	8.28	-0.2520	0.0432	2
38	5	LPA	0.1	7.14	-0.0325	0.0060	6
16	5	CDH26	0.001	6.11	-0.1283	0.0260	20
3	6	BRCA2	0.0001	9.14	0.1920	0.0312	13
4	7	GNB2	0.0001	5.84	0.8585	0.1781	7
5	8	RNF216	0.0001	6.28	-0.3578	0.0713	7
6	10	MIP	0.0001	6.30	0.3502	0.0697	12
7	11	CLPTM1L	0.0001	6.23	0.4448	0.0890	5
52	16	PKD1	1e-05	11.63	0.6068	0.0865	16
57	16	TET2	singleton	6.73	0.4271	0.0820	4
9	16	BRCA2	0.0001	5.62	0.1481	0.0314	13
33	19	OCA2	0.01	6.45	0.1232	0.0242	15
45	19	RAD52	0.1	5.99	-0.0378	0.0077	12
11	20	DEFB1	0.0001	6.05	0.7003	0.1425	8