I have a hail matrix table, and when i run mt_filtered.describe()
, i get the following:
----------------------------------------
Global fields:
None
----------------------------------------
Column fields:
's': str
'metadata': struct {
``: int32,
gender_concept_id: int32,
gender: str,
date_of_birth: str,
race_concept_id: int32,
race: str,
ethnicity_concept_id: int32,
ethnicity: str,
sex_at_birth_concept_id: int32,
sex_at_birth: str,
survey_datetime: str,
survey: str,
question_concept_id: int32,
question: str,
answer_concept_id: int32,
answer: str,
survey_version_concept_id: str,
survey_version_name: str
}
----------------------------------------
Row fields:
'locus': locus<GRCh38>
'alleles': array<str>
'filters': set<str>
'a_index': int32
'was_split': bool
'variant_qc': struct {
gq_stats: struct {
mean: float64,
stdev: float64,
min: float64,
max: float64
},
call_rate: float64,
n_called: int64,
n_not_called: int64,
n_filtered: int64,
n_het: int64,
n_non_ref: int64,
het_freq_hwe: float64,
p_value_hwe: float64,
p_value_excess_het: float64
}
'info': struct {
AC: array<int32>,
AF: array<float64>,
AN: int32,
homozygote_count: array<int32>
}
'annotations': struct {
vid: str,
transcript: str,
contig: str,
position: int32,
ref_allele: str,
alt_allele: str,
gvs_all_ac: str,
gvs_all_an: str,
gvs_all_af: str,
gvs_all_sc: str,
gvs_max_af: str,
gvs_max_ac: str,
gvs_max_an: str,
gvs_max_sc: str,
gvs_max_subpop: str,
gvs_afr_ac: str,
gvs_afr_an: str,
gvs_afr_af: str,
gvs_afr_sc: str,
gvs_amr_ac: str,
gvs_amr_an: str,
gvs_amr_af: str,
gvs_amr_sc: str,
gvs_eas_ac: str,
gvs_eas_an: str,
gvs_eas_af: str,
gvs_eas_sc: str,
gvs_eur_ac: str,
gvs_eur_an: str,
gvs_eur_af: str,
gvs_eur_sc: str,
gvs_mid_ac: str,
gvs_mid_an: str,
gvs_mid_af: str,
gvs_mid_sc: str,
gvs_oth_ac: str,
gvs_oth_an: str,
gvs_oth_af: str,
gvs_oth_sc: str,
gvs_sas_ac: str,
gvs_sas_an: str,
gvs_sas_af: str,
gvs_sas_sc: str,
gene_symbol: str,
transcript_source: str,
aa_change: str,
consequence: str,
dna_change_in_transcript: str,
variant_type: str,
exon_number: str,
intron_number: str,
genomic_location: str,
dbsnp_rsid: str,
gene_id: str,
gene_omim_id: str,
is_canonical_transcript: str,
gnomad_all_af: str,
gnomad_all_ac: str,
gnomad_all_an: str,
gnomad_failed_filter: str,
gnomad_max_af: str,
gnomad_max_ac: str,
gnomad_max_an: str,
gnomad_max_subpop: str,
gnomad_afr_ac: str,
gnomad_afr_an: str,
gnomad_afr_af: str,
gnomad_amr_ac: str,
gnomad_amr_an: str,
gnomad_amr_af: str,
gnomad_asj_ac: str,
gnomad_asj_an: str,
gnomad_asj_af: str,
gnomad_eas_ac: str,
gnomad_eas_an: str,
gnomad_eas_af: str,
gnomad_fin_ac: str,
gnomad_fin_an: str,
gnomad_fin_af: str,
gnomad_nfr_ac: str,
gnomad_nfr_an: str,
gnomad_nfr_af: str,
gnomad_sas_ac: str,
gnomad_sas_an: str,
gnomad_sas_af: str,
gnomad_oth_ac: str,
gnomad_oth_an: str,
gnomad_oth_af: str,
revel: str,
splice_ai_acceptor_gain_score: str,
splice_ai_acceptor_gain_distance: str,
splice_ai_acceptor_loss_score: str,
splice_ai_acceptor_loss_distance: str,
splice_ai_donor_gain_score: str,
splice_ai_donor_gain_distance: str,
splice_ai_donor_loss_score: str,
splice_ai_donor_loss_distance: str,
omim_phenotypes_id: str,
omim_phenotypes_name: str,
clinvar_classification: str,
clinvar_last_updated: str,
clinvar_phenotype: str
}
----------------------------------------
Entry fields:
'GT': call
'GQ': int32
'RGQ': int32
'FT': str
'AD': array<int32>
----------------------------------------
Column key: ['s']
Row key: ['locus', 'alleles']
----------------------------------------
I want to create a barplot such that the x-axis is the different genes (mt_filtered.annotations.gene_symbol
), and there will be pairs of bars for each gene. One bar in each pair will represent the count of male samples and the other bar will be the count of female samples. The gender is given in mt_filtered.metadata.gender
.
I tried the following code, but i got some errors:
# Aggregate the counts of male and female for each gene
gender_counts = mt_filtered.group_rows_by(
mt_filtered.annotations.gene_symbol
).aggregate(
male_count=hl.agg.count_where(mt_filtered.metadata.gender == 'Male'),
female_count=hl.agg.count_where(mt_filtered.metadata.gender == 'Female')
)
# Convert to Pandas DataFrame for plotting
gender_df = gender_counts.rows().to_pandas()
# Reshape the DataFrame for seaborn plotting
gender_df = gender_df.melt(id_vars=["gene_symbol"], value_vars=["male_count", "female_count"],
var_name="gender", value_name="count")
# Replace 'male_count' and 'female_count' with 'Male' and 'Female' for better plot labels
gender_df['gender'] = gender_df['gender'].replace({'male_count': 'Male', 'female_count': 'Female'})
# Plot the results
plt.figure(figsize=(12, 6))
sns.barplot(data=gender_df, x='gene_symbol', y='count', hue='gender')
plt.xlabel('Gene')
plt.ylabel('Count')
plt.title('Count of Male and Female Samples for Each Gene')
plt.legend(title='Gender')
plt.show()
I will greatly appreciate any suggestion on how I can get the desired plot. Thanks