import os

# Directory paths
# Root of the exome data; raw reads are looked up under <DATA_DIR>/<sample>/raw/.
DATA_DIR = "/home/debian/data1/breast/data/wes"
# Directory holding the reference genome and the BWA index files built from it.
REFERENCE_DIR = "/home/debian/data2/genomes/human_GRCh38.p14"
# Gzipped FASTA of the reference genome.
REFERENCE_GENOME = os.path.join(REFERENCE_DIR, "GRCh38_latest_genomic.fna.gz")

# Samples to process. The upper-case name matches the other module-level
# constants; the lower-case alias is kept so existing references keep working.
SAMPLES = ['100B', '101B']
samples = SAMPLES
# Build the BWA index for the reference genome.
# The five index files are declared as outputs, each named after the
# reference FASTA with one of the listed suffixes appended.
rule bwa_index:
    input:
        REFERENCE_GENOME
    output:
        multiext(REFERENCE_GENOME, ".amb", ".ann", ".bwt", ".pac", ".sa")
    log:
        "logs/bwa_index.log"
    shell:
        # stdout and stderr are both redirected into the rule's log file.
        """
        bwa index {input} &> {log}
        """
# Discover which (SAMPLE, SEQ_ID, LANE) combinations actually exist on disk.
# SEQ_ID cannot be derived from the sample name and the number of lanes is
# not fixed, so both are read from the R1 file names instead of being
# hard-coded. The inline constraints keep each wildcard inside a single
# path component / underscore-delimited field.
FASTQ_UNITS = glob_wildcards(
    os.path.join(
        DATA_DIR,
        "{SAMPLE,[^/]+}",
        "raw",
        r"{SAMPLE}-exome-tumor_{SEQ_ID,[^_/]+}_L{LANE,\d+}_R1_001.fastq.gz",
    )
)

# Every trimmed FASTQ (R1 and R2) that can be produced for the selected
# samples; units belonging to samples not listed in `samples` are skipped.
FASTP_TRIMMED = [
    f"fastp/{sample}/{sample}-exome-tumor_{seq_id}_L{lane}_{read}_001.fastq.gz"
    for sample, seq_id, lane in zip(
        FASTQ_UNITS.SAMPLE, FASTQ_UNITS.SEQ_ID, FASTQ_UNITS.LANE
    )
    if sample in samples
    for read in ("R1", "R2")
]


# Convenience target: `snakemake fastp_all` trims every discovered lane of
# every selected sample.
rule fastp_all:
    input:
        FASTP_TRIMMED


# Trim one lane of paired-end reads with fastp.
# SAMPLE, SEQ_ID and LANE are ordinary wildcards: Snakemake fills them in
# from the requested output path, so the rule itself never needs to know
# which sequencing ids or lanes exist.
rule run_fastp:
    input:
        # Wildcards must be written as strings; they are not Python names.
        r1=os.path.join(DATA_DIR, "{SAMPLE}", "raw", "{SAMPLE}-exome-tumor_{SEQ_ID}_L{LANE}_R1_001.fastq.gz"),
        r2=os.path.join(DATA_DIR, "{SAMPLE}", "raw", "{SAMPLE}-exome-tumor_{SEQ_ID}_L{LANE}_R2_001.fastq.gz"),
    output:
        r1_trimmed="fastp/{SAMPLE}/{SAMPLE}-exome-tumor_{SEQ_ID}_L{LANE}_R1_001.fastq.gz",
        # R2 gets its own file; previously both outputs pointed at the R1 name.
        r2_trimmed="fastp/{SAMPLE}/{SAMPLE}-exome-tumor_{SEQ_ID}_L{LANE}_R2_001.fastq.gz",
    wildcard_constraints:
        # Same constraints as the discovery pattern, so output paths are
        # parsed unambiguously.
        SAMPLE="[^/]+",
        SEQ_ID="[^_/]+",
        LANE=r"\d+",
    threads: 8
    shell:
        """
        fastp -i {input.r1} -I {input.r2} -o {output.r1_trimmed} -O {output.r2_trimmed} -w {threads}
        """
My files are named like this:
250B-exome-tumor_S36_L001_R1_001.fastq.gz 250B-exome-tumor_S36_L002_R1_001.fastq.gz 250B-exome-tumor_S36_L003_R1_001.fastq.gz 250B-exome-tumor_S36_L004_R1_001.fastq.gz
250B-exome-tumor_S36_L001_R2_001.fastq.gz 250B-exome-tumor_S36_L002_R2_001.fastq.gz 250B-exome-tumor_S36_L003_R2_001.fastq.gz 250B-exome-tumor_S36_L004_R2_001.fastq.gz
Inside the wes directory I have many sample directories: 100B/raw, 101B/raw, 102B/raw, and so on.
The problem is that the SEQ_ID is unrelated to the sample number, so I have to find it first.
I also have a variable number of lanes.
What’s the easiest way to deal with this inside Snakemake?