How to derive snakemake wildcards from python objects?

45 views Asked by At

I am learning snakemake to develop genomics pipelines. Since the inputs/outputs will get very diverse very quickly, I thought to spend some extra time understanding the basics of building snakemake script. My goal is to keep the code explicit and extendable by using python objects, but at the same time convert it from python loops to snakemake wildcards but i couldnt find a way to hack it. How to derive snakemake wildcards from python objects ?

The python class:

class Reference:
    def __init__(self, name, species, source, genome_seq, genome_seq_url, transcript_seq, transcript_seq_url, annotation_gtf, annotation_gtf_url, annotation_gff, annotation_gff_url) -> None:
        
        self.name = name
        self.species = species
        self.source = source
        self.genome_seq = genome_seq
        self.genome_seq_url = genome_seq_url
        self.transcript_seq = transcript_seq
        self.transcript_seq_url = transcript_seq_url
        self.annotation_gtf = annotation_gtf
        self.annotation_gtf_url = annotation_gtf_url
        self.annotation_gff = annotation_gff
        self.annotation_gff_url = annotation_gff_url

CSV references file:

name,species,source,genome_seq,genome_seq_url,transcript_seq,transcript_seq_url,annotation_gtf,annotation_gtf_url,annotation_gff,annotation_gff_url
BDGP6_46,FruitFly,Ensembl,Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa.gz,https://ftp.ensembl.org/pub/release-111/fasta/drosophila_melanogaster/dna/Drosophila_melanogaster.BDGP6.46.dna.toplevel.fa.gz,Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz,https://ftp.ensembl.org/pub/release-111/fasta/drosophila_melanogaster/cdna/Drosophila_melanogaster.BDGP6.46.cdna.all.fa.gz,Drosophila_melanogaster.BDGP6.46.111.gtf.gz,https://ftp.ensembl.org/pub/release-111/gtf/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.111.gtf.gz,Drosophila_melanogaster.BDGP6.46.111.gff3.gz,https://ftp.ensembl.org/pub/release-111/gff3/drosophila_melanogaster/Drosophila_melanogaster.BDGP6.46.111.gff3.gz

Smakefile:

def get_references(references_path:str) -> dict:
    refs_table = dict()
    
    with open(references_path, 'r') as file:
        reader = csv.DictReader(file)
        for row in reader:
            ref_data = Reference(
                row['name'],
                row['species'],
                row['source'],
                row['genome_seq'],
                row['genome_seq_url'],
                row['transcript_seq'],
                row['transcript_seq_url'],
                row['annotation_gtf'],
                row['annotation_gtf_url'],
                row['annotation_gff'],
                row['annotation_gff_url']
            )

            refs_table[row['name']] = ref_data

        return refs_table
    
references_table = get_references('references.csv')

rule all:
    input:
        genome_seq     = expand("../resources/references/{ref_name}/{genome_seq}",         zip, 
                                genome_seq=[references_table[ref].genome_seq for ref in references_table.keys()], 
                                ref_name=[references_table[ref].name for ref in references_table.keys()]),

        transcript_seq = expand("../resources/references/{ref_name}/{transcript_seq}", zip, 
                                transcript_seq=[references_table[ref].transcript_seq for ref in references_table],
                                ref_name=[references_table[ref].name for ref in references_table]),

        annotation_gtf = expand("../resources/references/{ref_name}/{annotation_gtf}", zip, 
                                annotation_gtf=[references_table[ref].annotation_gtf for ref in references_table], 
                                ref_name=[references_table[ref].name for ref in references_table]),

        annotation_gff = expand("../resources/references/{ref_name}/{annotation_gff}", zip, 
                                annotation_gff=[references_table[ref].annotation_gff for ref in references_table.keys()], 
                                ref_name=[references_table[ref].name for ref in references_table.keys()]),

Current implemnetation using dynamic rules:

for ref_name, ref in references_table.items(): 

    pathlib.Path(f"../resources/references/{ref_name}/").mkdir(parents=True, exist_ok=True) 
    pathlib.Path(f"../logs/download/refs/").mkdir(parents=True, exist_ok=True) 
    pathlib.Path(f"../times/download/refs/").mkdir(parents=True, exist_ok=True) 

    genome_seq     = f"../resources/references/{ref_name}/{ref.genome_seq}"
    transcript_seq = f"../resources/references/{ref_name}/{ref.transcript_seq}"
    annotation_gtf = f"../resources/references/{ref_name}/{ref.annotation_gtf}"
    annotation_gff = f"../resources/references/{ref_name}/{ref.annotation_gff}"

    log_file  = f"../logs/download/refs/{ref_name}.txt"
    time_file = f"../times/download/refs/{ref_name}.txt"

    genome_seq_url     = ref.genome_seq_url
    transcript_seq_url = ref.transcript_seq_url
    annotation_gtf_url = ref.annotation_gtf_url
    annotation_gff_url = ref.annotation_gff_url

    rule_name = f"download_reference_{ref_name}"

    rule:
        name : rule_name
        output: 
            genome_seq = genome_seq,
            transcript_seq = transcript_seq,
            annotation_gtf = annotation_gtf, 
            annotation_gff = annotation_gff
        params:
            genome_seq_url     = genome_seq_url,
            transcript_seq_url = transcript_seq_url,
            annotation_gtf_url = annotation_gtf_url,
            annotation_gff_url = annotation_gff_url,
        log:
            log_file = log_file
        benchmark:
            time_file
        container:
            "dockers/general_image"
        threads: 
            1
        message:
            "Downloading {params.genome_seq_url} and {params.transcript_seq_url} and {params.annotation_gtf_url} and {params.annotation_gff_url}"
        shell:
            """ 
            wget {params.genome_seq_url} -O {output.genome_seq}           &> {log.log_file}
            wget {params.transcript_seq_url} -O {output.transcript_seq}   &> {log.log_file}
            wget {params.annotation_gtf_url} -O {output.annotation_gtf}   &> {log.log_file}
            wget {params.annotation_gff_url} -O {output.annotation_gff}   &> {log.log_file}
            """
        
1

There are 1 answers

6
KeyboardCat On

By setting up your rule all as you have, it already expects the output from your download rule for all ref_names in your table, so it isn't necessary to generate a new rule for each ref_name.

It'd also be easier to break your download rule up into separate rules - that way it can run in parallel, and you can essentially copy the respective string you used into your expand statements as the expected output for each rule, with one change - it has to be unambiguous which rule produces which output, for example by saving each file type in a different directory, or specifying the extensions if they're guaranteed to be the same for each file type but different across file types (I've done the latter in the example, but if you know your data won't be that pretty you might have to solve this differently).

Finally, in a good deal of rule attributes (such as input and params), you can use functions or lambda expressions to generate what you want to pass to Snakemake in the rule itself. You then pass the wildcards object to the function and are able to access the wildcards you've set within the rule by name.

(For the outputs, logs and benchmarks, meanwhile, since we are outputting to a new file, Snakemake expects the filename to contain all wildcards to make absolutely sure we're not overwriting across executions of the rule where only one of the wildcards is changed.)

Thus, your rules would look something like this:

rule all:
    input:
        genome_seq     = expand("../resources/references/{ref_name}/{genome_seq}",         zip, 
                                genome_seq=[references_table[ref].genome_seq for ref in references_table.keys()], 
                                ref_name=[references_table[ref].name for ref in references_table.keys()]),

        transcript_seq = expand("../resources/references/{ref_name}/{transcript_seq}", zip, 
                                transcript_seq=[references_table[ref].transcript_seq for ref in references_table],
                                ref_name=[references_table[ref].name for ref in references_table]),

        annotation_gtf = expand("../resources/references/{ref_name}/{annotation_gtf}", zip, 
                                annotation_gtf=[references_table[ref].annotation_gtf for ref in references_table], 
                                ref_name=[references_table[ref].name for ref in references_table]),

        annotation_gff = expand("../resources/references/{ref_name}/{annotation_gff}", zip, 
                                annotation_gff=[references_table[ref].annotation_gff for ref in references_table.keys()], 
                                ref_name=[references_table[ref].name for ref in references_table.keys()])

rule download_genome:
    output:
        genome_seq = "../resources/references/{ref_name}/{genome_seq}.dna.toplevel.fa.gz"
    params:
        genome_seq_url = lambda wildcards: references_table[wildcards.ref_name].genome_seq_url
    log:
        log_file = "../logs/download/refs/{ref_name}_{genome_seq}.txt"
    benchmark:
        "../times/download/refs/{ref_name}_{genome_seq}.txt"
    container:
        "dockers/general_image"
    threads:
        1
    message:
        "Downloading {params.genome_seq_url}"
    shell:
        """
        wget {params.genome_seq_url} -O {output.genome_seq}           &> {log.log_file}
        """

rule download_transcript:
    output:
        transcript_seq = "../resources/references/{ref_name}/{transcript_seq}cdna.all.fa.gz"
    params:
        transcript_seq_url = lambda wildcards: references_table[
            wildcards.ref_name].transcript_seq_url
    log:
        log_file = "../logs/download/refs/{ref_name}_{transcript_seq}.txt"
    benchmark:
        "../times/download/refs/{ref_name}_{transcript_seq}.txt"
    container:
        "dockers/general_image"
    threads:
        1
    message:
        "Downloading {params.transcript_seq_url}"
    shell:
        """
        wget {params.transcript_seq_url} -O {output.transcript_seq}           &> {log.log_file}
        """

rule download_annotation_gtf:
    output:
        annotation_gtf = "../resources/references/{ref_name}/{annotation_gtf}.gtf.gz"
    params:
        annotation_gtf_url = lambda wildcards: references_table[
            wildcards.ref_name].annotation_gtf_url
    log:
        log_file = "../logs/download/refs/{ref_name}_{annotation_gtf}.txt"
    benchmark:
        "../times/download/refs/{ref_name}_{annotation_gtf}.txt"
    container:
        "dockers/general_image"
    threads:
        1
    message:
        "Downloading {params.annotation_gtf_url}"
    shell:
        """
        wget {params.annotation_gtf_url} -O {output.annotation_gtf}           &> {log.log_file}
        """

rule download_annotation_gff:
    output:
        annotation_gff = "../resources/references/{ref_name}/{annotation_gff}.gff3.gz"
    params:
        annotation_gff_url = lambda wildcards: references_table[
            wildcards.ref_name].annotation_gff_url
    log:
        log_file = "../logs/download/refs/{ref_name}_{annotation_gff}.txt"
    benchmark:
        "../times/download/refs/{ref_name}_{annotation_gff}.txt"
    container:
        "dockers/general_image"
    threads:
        1
    message:
        "Downloading {params.annotation_gff_url}"
    shell:
        """
        wget {params.annotation_gff_url} -O {output.annotation_gff}           &> {log.log_file}
        """