diff --git a/README.md b/README.md
index a63b175f9580a038a4b1025bad321729ba57fe95..61656e4f17eeae844dd61fbbf28c22cad5260aff 100644
--- a/README.md
+++ b/README.md
@@ -21,8 +21,8 @@ Pan1c/
 ```
 
 # Prepare your data
 This workflow can take chromosome level assemblies as well as contig level assemblies.
-**Fasta files need to be compressed** using bgzip2 (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
-Records id **must follow this pattern** : `<haplotype name>#<ctg|chr name>`. (`CHM13#chr01` for example).
+**Fasta files need to be compressed** using **bgzip** (included in [PanGeTools](https://forgemia.inra.fr/alexis.mergez/pangetools)).
+Record IDs **must follow this pattern**: `<haplotype name>#<ctg|chr name>` (for example `CHM13#chr01`, where the fasta file is named CHM13.fa.gz).
 Because of the clustering step, this pattern is only needed for the reference assembly.
 Input files should be read only to prevent snakemake to mess with them (which seems to happen in some rare cases).
@@ -32,6 +32,7 @@ Before running the worflow, some apptainer images needs to be downloaded :
 ```
 apptainer build <your_apptainer_image_directory>/PanGeTools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pangetools:latest
 apptainer build <your_apptainer_image_directory>/pytools.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/pytools:latest
 apptainer build <your_apptainer_image_directory>/pggb.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/pggb:latest
+apptainer build <your_apptainer_image_directory>/snakebox.sif oras://registry.forgemia.inra.fr/alexis.mergez/pangratools/snakebox:latest
 ```
 
 # Usage
diff --git a/Snakefile b/Snakefile
index a2eef6fd6651c937ca4de78a651f28e853283e26..c0e1e7b51d683399cf920c05394d3fccbd19d45a 100644
--- a/Snakefile
+++ b/Snakefile
@@ -8,26 +8,12 @@ nHAP = len(SAMPLES)
 
 rule all:
     input:
-        "output/pggb.unnormalized.gfa",
-        "output/aggregatedStats.tsv",
+        "output/pan1c.pggb."+config['name']+".gfa",
+        "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv",
         "output/figures",
-        "output/pav"
-
-rule haplotypes_aggregation:
-    # Concatenate every haplotype into a single, compressed, fasta
-    input:
-        expand("data/haplotypes/{sample}.fa.gz", sample=SAMPLES)
-    output:
-        "data/input_haplotypes/aggregated.fa.gz"
-    threads: workflow.cores
-    params:
-        apppath=config["app.path"]
-    shell:
-        """
-        zcat {input} > $(dirname {output})/aggregated.fa
-        apptainer run --app bgzip {params.apppath}/PanGeTools.sif \
-            -@ {threads} $(dirname {output})/aggregated.fa
-        """
+        "output/pav.matrices",
+        "logs/pan1c.pggb."+config['name']+".logs.tar.gz",
+        "output/pan1c.pggb."+config['name']+".usage.stats.tsv"
 
 rule samtools_index:
     # Using samtools faidx to index compressed fasta
@@ -51,7 +37,7 @@ rule ragtag_scaffolding:
         refgzi="data/haplotypes/"+config['reference']+".gzi",
         fa="data/haplotypes/{haplotype}.fa.gz"
     output:
-        "data/ragtag_hap/{haplotype}.ragtagged.fa.gz"
+        "data/hap.ragtagged/{haplotype}.ragtagged.fa.gz"
     threads: 8
     params:
         apppath=config["app.path"]
@@ -67,9 +53,9 @@
 checkpoint clustering:
-    # Reading alignment file to create bins for each chromosome
+    # Read alignment file to create bins for each chromosome
     input:
-        expand('data/ragtag_hap/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
+        expand('data/hap.ragtagged/{haplotype}.ragtagged.fa.gz', haplotype=SAMPLES)
     output:
         directory("data/chrInputs")
     threads: workflow.cores
@@ -78,19 +64,21 @@
     params:
         apppath=config["app.path"]
     shell:
        """
        mkdir -p {output}
-        apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py --fasta {input} --output {output}
+        apptainer run {params.apppath}/pytools.sif python scripts/inputClustering.py \
+            --fasta {input} --output {output}
         for file in {output}/*.fa; do
             apptainer run --app bgzip {params.apppath}/PanGeTools.sif -@ {threads} $file
         done
        """
 
 rule pggb_on_chr:
-    # Running pggb on a specific chromosome
+    # Runs pggb on a specific chromosome
     input:
         fa="data/chrInputs/{chromosome}.fa.gz",
         gzi="data/chrInputs/{chromosome}.fa.gz.gzi"
     output:
-        "data/chrGraphs/{chromosome}.gfa"
+        gfa="data/chrGraphs/{chromosome}.gfa",
+        og="data/chrGraphs/{chromosome}.og"
     threads: 16
     params:
         pggb=config['pggb.params'],
@@ -105,11 +93,13 @@
             -i {input.fa} {params.pggb} -n {nHAP} -t {threads} -T {threads} -o data/chrGraphs/{wildcards.chromosome} 2>&1 | \
             tee {log.cmd}
         echo $(ls data/chrGraphs/{wildcards.chromosome})
-        mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output}
+        mv data/chrGraphs/{wildcards.chromosome}/*.gfa {output.gfa}
+        apptainer run --app odgi {params.apppath}/PanGeTools.sif \
+            build -s -O -t {threads} -P -g {output.gfa} -o {output.og}
        """
 
 def availGraphs(wildcards):
-    # Getting the list of output from 'clustering' and retunring the list of corresponding graphs
+    # Gets the list of outputs from 'clustering' and returns the list of corresponding graphs
     # This triggers snakemake to run pggb on every chromosomes available
     checkpoint_output = checkpoints.clustering.get(**wildcards).output[0]
     return expand(
@@ -135,7 +125,7 @@ rule graph_squeeze:
     input:
         "data/chrGraphs/graphsList.txt"
     output:
-        "output/pggb.unnormalized.gfa"
+        "output/pan1c.pggb."+config['name']+".gfa"
     threads: 16
     params:
         apppath=config['app.path']
@@ -151,7 +141,7 @@
 rule graph_stats:
     # Using odgi to produce stats on every chromosome scale graph
     input:
-        availGraphs
+        "data/chrGraphs/graphsList.txt"
     output:
         directory("output/stats")
     threads: 4
@@ -159,12 +149,17 @@
     params:
         apppath=config['app.path']
     run:
         shell("mkdir {output}")
-        for graph in input:
+        # Getting the list of graphs
+        with open(input[0]) as handle:
+            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
+        # Iterating over graphs
+        for graph in graphList:
             shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif stats -S -t {threads} -P -i {graph} > {output}/$(basename {graph} .gfa).stats.tsv")
 
 rule graph_figs:
+    # Creating figures using odgi viz
     input:
-        availGraphs
+        "data/chrGraphs/graphsList.txt"
     output:
         directory("output/figures")
     threads: 4
@@ -174,29 +169,81 @@
     params:
         oneDviz=config['odgi.1Dviz.params'],
         pcov=config['odgi.pcov.params']
     run:
         shell("mkdir {output}")
-        for graph in input:
-            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz}")
-            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov}")
+        # Getting the list of graphs
+        with open(input[0]) as handle:
+            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
+        # Iterating over graphs
+        for graph in graphList:
+            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).1Dviz.png {params.oneDviz} -t {threads} -P")
+            shell("apptainer run --app odgi {params.apppath}/PanGeTools.sif viz -i {graph} -o {output}/$(basename {graph} .gfa).pcov.png {params.pcov} -t {threads} -P")
 
 rule aggregate_stats:
-    # Reading and merging all stats files into a final one
+    # Reads and merges all chromosome scale stats files into a single TSV
     input:
         "output/stats/"
     output:
-        "output/aggregatedStats.tsv"
+        "output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv"
+    params:
+        apppath=config['app.path']
     shell:
-        "python scripts/statsAggregation.py --input {input} --output {output}"
+        """
+        apptainer run {params.apppath}/pytools.sif python scripts/statsAggregation.py \
+            --input {input} --output {output}
+        """
 
 rule get_pav:
-    # Create PAV matrix readable by panache
+    # Creates a PAV matrix readable by Panache for each chromosome scale graph
     input:
-        availGraphs
+        "data/chrGraphs/graphsList.txt"
     output:
-        directory("output/pav")
-    threads: 1
+        directory("output/pav.matrices")
+    threads: max(1, int(0.25 * workflow.cores))
     params:
         apppath=config['app.path']
     run:
         shell("mkdir {output}")
-        for graph in input:
-            shell("bash scripts/getPanachePAV.sh {graph} data/chrGraphs/$(basename {graph} .gfa) {output}/$(basename {graph} .gfa).pav.matrix.tsv {params.apppath}")
+        # Getting the list of graphs
+        with open(input[0]) as handle:
+            graphList = [graph.rstrip("\n") for graph in handle.readlines()]
+        # Iterating over graphs
+        for graph in graphList:
+            shell("bash scripts/getPanachePAV.sh -g {graph} -d data/chrGraphs/$(basename {graph} .gfa) -o {output}/$(basename {graph} .gfa).pav.matrix.tsv -a {params.apppath} -t {threads}")
+
+rule pggb_log_compression:
+    input:
+        flag="output/pan1c.pggb."+config['name']+".chromGraph.stats.tsv"
+    output:
+        tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz",
+        chrLen="logs/chrInputLen.tsv"
+    shell:
+        """
+        cd logs/pggb
+        tar -zcvf ../../{output.tar} *
+        cd ../..
+
+        if [ -f {output.chrLen} ]; then
+            rm {output.chrLen}
+        fi
+
+        for chrFasta in data/chrInputs/*.fa.gz; do
+            filename=$(basename $chrFasta .fa.gz)
+            num_bases=$(zgrep -v '^>' $chrFasta | tr -d '\\n' | wc -c)
+            printf '%s\\t%s\\n' "$filename" "$num_bases" >> {output.chrLen}
+        done
+        """
+
+rule resources_statistics:
+    input:
+        tar="logs/pan1c.pggb."+config['name']+".logs.tar.gz",
+        chrLen="logs/chrInputLen.tsv"
+    output:
+        tsv="output/pan1c.pggb."+config['name']+".usage.stats.tsv",
+        dir=directory("output/pggb.usage.figs")
+    params:
+        apppath=config['app.path']
+    shell:
+        """
+        mkdir -p {output.dir}
+        apptainer run {params.apppath}/pytools.sif python scripts/usageStats.py \
+            -i {input.tar} -l {input.chrLen} -o {output.tsv} -f {output.dir}
+        """
\ No newline at end of file
diff --git a/config.yaml b/config.yaml
index 0ddab08fabc09e2f9ea4df83fc93d1b4713a73ca..be94960a6ee680a62ebbd63caf96e8ad5c35bad8 100644
--- a/config.yaml
+++ b/config.yaml
@@ -1,3 +1,4 @@
+name: '02H'
 reference: 'CHM13v2.0.fa.gz'
 app.path: '/home/amergez/work/apptainer'
 wfmash.params: '--approx-map'
diff --git a/runSnakemake.sh b/runSnakemake.sh
index 84f9786e3ed3a0f79bb1490298eba57e593bb633..6f7c9df9381d671f3ca1efd7f8d83125826778f8 100755
--- a/runSnakemake.sh
+++ b/runSnakemake.sh
@@ -1,8 +1,8 @@
 #!/bin/bash
 #SBATCH -p unlimitq
 #SBATCH --cpus-per-task=32
-#SBATCH -o <log path>/pggb.1chr.<id>.log
-#SBATCH -J pggb.1chr.<id>
+#SBATCH -o <log path>.log
+#SBATCH -J <jname>
 #SBATCH --mem=1T
 #SBATCH --mail-type=BEGIN,END,FAIL
 #SBATCH --mail-user=<mail>
@@ -14,4 +14,4 @@
 module load devel/Miniconda/Miniconda3
 source ~/.bashrc
 mamba activate snakemake
-snakemake -c $(nproc)
\ No newline at end of file
+snakemake -c $(nproc)
diff --git a/scripts/bin_split.py b/scripts/bin_split.py
deleted file mode 100644
index 2cd3999e750e7636b61bc34bfc127d388a6e9cd8..0000000000000000000000000000000000000000
--- a/scripts/bin_split.py
+++ /dev/null
@@ -1,80 +0,0 @@
-from Bio import SeqIO
-import pandas as pd
-import argparse
-import gzip
-import os
-
-arg_parser = argparse.ArgumentParser(description='Bin splitter')
-arg_parser.add_argument(
-    "--paf",
-    "-p",
-    dest = "paf",
-    required = True,
-    help = "Paf file"
-    )
-arg_parser.add_argument(
-    "--dir",
-    "-d",
-    dest = "dir",
-    required = True,
-    help = "Output directory"
-    )
-arg_parser.add_argument(
-    "--fasta",
-    "-f",
-    dest = "fasta",
-    required = True,
-    help = "Aggregated fasta"
-    )
-args = arg_parser.parse_args()
-
-
-df = pd.read_csv(
-    args.paf,
-    sep = '\t',
-    header = None,
-    usecols = [0, 5, 11],
-    index_col = False,
-    names = ["ctg", "chr", "qual"]
-    )
-
-chrList = df.chr.unique()
-chrBins = {name:[] for name in chrList}
-
-# Getting the best mapping chromosome for each contig
-for chrm in chrList:
-    subdf = df[df.chr == chrm]
-
-    for ctg in subdf.ctg.unique():
-        ctgdf = subdf[subdf.ctg == ctg].copy()
-        ctgdf.sort_values(
-            by = ["qual"],
-            inplace = True,
-            ascending= False
-            )
-
-        chrBins[ctgdf.iloc[0].chr].append(ctg)
-
-# Dictionnary linking contig name to its corresponding chromosome name
-reverseDict = {
-    ctg: chrm
-    for chrm in chrBins.keys()
-    for ctg in chrBins[chrm]
-    }
-
-# Adding sequences to correct chromosome bins
-chrSeq = {name:[] for name in chrList}
-with gzip.open(args.fasta, "rt") as handle:
-    sequences = SeqIO.parse(handle, "fasta")
-
-    for record in sequences:
-        try :
-            chrSeq[reverseDict[record.id]].append(record)
-        except :
-            pass
-            # The contig has not been mapped to a chromosome (weird)
-
-# Writing chromosome bin specific fasta files
-for outname in chrSeq.keys():
-    with open(os.path.join(args.dir, f"{outname}.fa"), "w") as output_handle:
-        SeqIO.write(chrSeq[outname], output_handle, "fasta")
\ No newline at end of file
diff --git a/scripts/getPanachePAV.sh b/scripts/getPanachePAV.sh
index d3310ae6a7f729733f48f1d44796826e96e93d22..719b4cf2b49c0bac850084b820e91b1d50794fd6 100644
--- a/scripts/getPanachePAV.sh
+++ b/scripts/getPanachePAV.sh
@@ -1,38 +1,50 @@
 #!/bin/bash
-# Get a GFA and return a Presence/Absence Variants readable by Panache
+# Takes a GFA and returns a Presence/Absence Variant (PAV) matrix readable by Panache
 # Panache apptainer image : oras://registry.forgemia.inra.fr/alexis.mergez/pangetools/panache:latest
 
-## Inputs
-# $1 = GFA path
-# $2 = Chromosome directory path (path used by pggb to store files like .paf)
-# $3 = Output path
-# $4 = Apptainer path to PanGeTools
+# Initializing arguments
+gfa=""      # GFA path
+appdir=""   # Directory containing apptainer images
+threads=""  # Threads count
+chrdir=""   # Chromosome directory (directory used by pggb to store step files like .paf, etc.)
+output=""   # Output PAV matrix path
+
+## Getting arguments
+while getopts "g:a:t:d:o:" option; do
+    case "$option" in
+        g) gfa="$OPTARG";;
+        a) appdir="$OPTARG";;
+        t) threads="$OPTARG";;
+        d) chrdir="$OPTARG";;
+        o) output="$OPTARG";;
echo "Usage: $0 [-g gfa] [-a appdir] [-t threads] [-d chrdir] [-o output]" >&2 + exit 1;; + esac +done # Getting chromosome name -chrname=$(basename $1 .gfa) +chrname=$(basename ${gfa} .gfa) # Getting paths in chromosome graph -apptainer run --app odgi "$4/PanGeTools.sif" paths -i $1 -L > $2/$chrname.paths.txt +apptainer run --app odgi "${appdir}/PanGeTools.sif" paths -i ${gfa} -L > ${chrdir}/$chrname.paths.txt # Creating bed using odgi untangle -apptainer run --app odgi "$4/PanGeTools.sif" untangle -i $1 -R $2/$chrname.paths.txt | sed '1d' | \ - cut -f 4,5,6 | sort | uniq > $2/$chrname.untangle.multiref.bed +apptainer run --app odgi "${appdir}/PanGeTools.sif" untangle -i ${gfa} -R ${chrdir}/$chrname.paths.txt -t $threads -P | sed '1d' | \ + cut -f 4,5,6 | sort | uniq > ${chrdir}/$chrname.untangle.multiref.bed # Running odgi pav -apptainer run --app odgi "$4/PanGeTools.sif" pav -i $1 -b $2/$chrname.untangle.multiref.bed -M \ - -B 0.5 > $2/$chrname.untangle.multiref.pavs.matrix.tsv +apptainer run --app odgi "${appdir}/PanGeTools.sif" pav -i ${gfa} -b ${chrdir}/$chrname.untangle.multiref.bed -M \ + -B 0.5 -t $threads -P > ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Adding empty column for Panache awk -F'\t' 'BEGIN {OFS="\t"} NR==1 {print $1, $2, $3, $4, "temp", "temp", $5, $6, $7} NR>1 {print $1, $2, $3, $4, ".", ".", $5, $6, $7}' \ - $2/$chrname.untangle.multiref.pavs.matrix.tsv > $2/temp1.tsv && \ - mv $2/temp1.tsv $2/$chrname.untangle.multiref.pavs.matrix.tsv + ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv > ${chrdir}/temp1.tsv && \ + mv ${chrdir}/temp1.tsv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Renaming columns for Panache -awk -F'\t' 'BEGIN {FS=OFS="\t"} NR==1 {sub($1, "#Chromosome"); sub($2, "FeatureStart"); sub($3, "FeatureStop"); sub($4, "Sequence_IUPAC_Plus"); sub($5, "SimilarBlocks"); sub($6, "Function")} {print}' \ - $2/$chrname.untangle.multiref.pavs.matrix.tsv > $2/temp2.tsv && \ - mv $2/temp2.tsv $2/$chrname.untangle.multiref.pavs.matrix.tsv +awk -F'\t' 'BEGIN {FS=OFS="\t"} NR==1 {sub(${gfa}, "#Chromosome"); sub(${chrdir}, "FeatureStart"); sub(${output}, "FeatureStop"); sub(${appdir}, "Sequence_IUPAC_Plus"); sub($5, "SimilarBlocks"); sub($6, "Function")} {print}' \ + ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv > ${chrdir}/temp2.tsv && \ + mv ${chrdir}/temp2.tsv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv # Moving the pav file to desired output -mv $2/$chrname.untangle.multiref.pavs.matrix.tsv $3 - - +mv ${chrdir}/$chrname.untangle.multiref.pavs.matrix.tsv ${output} diff --git a/scripts/statsAggregation.py b/scripts/statsAggregation.py index 0be68bcc284d8f631783df01d66c4988e565ec66..c60fe940b0403609efcd22ba80fa383cacc21b19 100644 --- a/scripts/statsAggregation.py +++ b/scripts/statsAggregation.py @@ -1,16 +1,23 @@ +""" +Statistics aggregator for Pan1c workflow + +Aggregate chromosome level graph statistics into a single TSV. 
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
 import os
-from Bio import SeqIO
-import gzip
-import pandas as pd
 import argparse
 
-arg_parser = argparse.ArgumentParser(description='Statistics aggregator')
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Statistics aggregator for Pan1c workflow')
 arg_parser.add_argument(
     "--input",
     "-i",
     dest = "inputDir",
     required = True,
-    help = "Input directory"
+    help = "Input directory containing chromosome scale stats"
     )
 arg_parser.add_argument(
     "--output",
@@ -21,6 +28,7 @@
     )
 args = arg_parser.parse_args()
 
+## Main script
 lines = []
 
 # Iterating over stats files
diff --git a/scripts/usageStats.py b/scripts/usageStats.py
new file mode 100644
index 0000000000000000000000000000000000000000..d04d2034a5997dbeee21670838aaec0f5fb228cc
--- /dev/null
+++ b/scripts/usageStats.py
@@ -0,0 +1,173 @@
+"""
+Usage statistics script for Pan1c workflow
+
+Given one or more tarballs containing time logs from the pggb rule of the workflow,
+the script returns an aggregated table as well as figures.
+
+@author: alexis.mergez@inrae.fr
+@version: 1.0
+"""
+
+from Bio import SeqIO
+import pandas as pd
+import numpy as np
+import argparse
+import gzip
+import tarfile
+import os
+import matplotlib.pyplot as plt
+import seaborn as sns
+
+## Arguments
+arg_parser = argparse.ArgumentParser(description='Pan1c usage stats script')
+arg_parser.add_argument(
+    "--input",
+    "-i",
+    nargs="+",
+    dest = "tarballs",
+    required = True,
+    help = "Tarballs containing time logs"
+    )
+arg_parser.add_argument(
+    "--chrinputlen",
+    "-l",
+    dest = "chrinputlen",
+    required = True,
+    help = "TSV containing input sequence length (generated by Pan1c workflow)"
+    )
+arg_parser.add_argument(
+    "--output",
+    "-o",
+    dest = "output",
+    required = True,
+    help = "Output table (.tsv)"
+    )
+arg_parser.add_argument(
+    "--figdir",
+    "-f",
+    dest = "figdir",
+    required = True,
+    help = "Output directory for figures"
+    )
+arg_parser.add_argument(
+    "--debug",
+    "-d",
+    action="store_true",
+    dest = "debug",
+    help = "Show debug"
+    )
+args = arg_parser.parse_args()
+
+## Toolbox
+def getRegEquation(x, y):
+    (slope, intercept), ssr, _1, _2, _3 = np.polyfit(x, y, 1, full = True)
+    ymean = np.mean(y)
+    sst = np.sum((y - ymean)**2)
+    r2 = (1 - (ssr/sst))[0]
+    return f'y = {slope:.2f}x + {intercept:.2f} (R2 : {r2:.2f})'
+
+## Main script
+aggregatedData = {}
+
+# Iterating through tarballs
+for tarball in args.tarballs:
+
+    # Iterating in given tarball files
+    with tarfile.open(tarball, "r") as tar :
+        tarfilelist = [f for f in tar.getmembers() if f.isfile() and f.name.endswith(".time.log")]
+
+        for file in tarfilelist:
+
+            # Extracting content into a dataframe
+            handle = tar.extractfile(file)
+            _tmpdf = pd.read_csv(
+                handle,
+                sep=": ",
+                header=None,
+                engine='python'
+                )
+
+            if args.debug: print(f"[timeStats::debug] Current filename: {file.name}")
+
+            # Getting Time, CPU and memory stats
+            time, cpu, memory = _tmpdf.iloc[4][1], int(_tmpdf.iloc[3][1][:-1]), int(_tmpdf.iloc[9][1])/((1024)**2)
+            chrid = (file.name).split(".")[0]
+
+            # Convert time to pd.timedelta (handling the 2 input types)
+            try :
+                time = pd.to_timedelta(time)
+            except :
+                time = pd.to_timedelta(f"00:{time}")
+
+            if args.debug: print(f"[timeStats::debug] Chr: {chrid}\tTime: {time}\tCPU: {cpu}%\t memory: {memory}Gb")
+
+            # Adding to aggregatedData
+            aggregatedData[chrid] = {"time": time, "cpu": cpu, "mem": memory}
+
+# Reading chrInputLen to get number of bases used to construct chromosomes
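+# Expected layout of chrInputLen.tsv (written by the pggb_log_compression rule):
+# two tab-separated columns, no header; the values below are illustrative only:
+#   chr01    230481012
+#   chr02    198129549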
+chrSeqLength = pd.read_csv(
+    args.chrinputlen,
+    sep="\t",
+    header=None,
+    index_col=0
+).to_dict()[1]
+
+if args.debug: print("[timeStats::debug]", chrSeqLength)
+
+# Adding base counts to aggregatedData
+for chromName, chromLength in chrSeqLength.items():
+    aggregatedData[chromName]["mbases"] = chromLength/1000000
+
+# Creating a dataframe
+df = pd.DataFrame(aggregatedData).transpose()
+df.time = pd.to_timedelta(df.time)
+df.mem = df.mem.astype(float)
+df.cpu = df.cpu.astype(int)
+df.mbases = df.mbases.astype(int)
+
+if args.debug: print("[timeStats::debug]", df.dtypes)
+
+# Saving the dataframe
+df.to_csv(args.output, sep='\t')
+
+# Creating some figures
+# Time versus base count
+sns.regplot(x=df.mbases, y=df.time.dt.total_seconds(), line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df.mbases, y=df.time.dt.total_seconds())
+plt.xlabel('Total input sequences length (Mb)')
+plt.ylabel('Graph creation time (s)')
+plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
+plt.savefig(os.path.join(args.figdir,"TimeVSSeqLength.png"))
+plt.close()
+
+# Memory versus base count
+sns.regplot(x=df.mbases, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df.mbases, y=df.mem)
+plt.xlabel('Total input sequences length (Mb)')
+plt.ylabel('Peak memory usage (GB)')
+plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
+plt.savefig(os.path.join(args.figdir,"MemoryVSSeqLength.png"))
+plt.close()
+
+# CPU versus base count
+sns.regplot(x=df.mbases, y=df.cpu, line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df.mbases, y=df.cpu)
+plt.xlabel('Total input sequences length (Mb)')
+plt.ylabel('CPU usage (%)')
+plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
+plt.savefig(os.path.join(args.figdir,"CpuVSSeqLength.png"))
+plt.close()
+
+# Memory versus CPU
+sns.regplot(x=df.cpu, y=df.mem, line_kws={"color":"r","alpha":0.7,"lw":5})
+equation = getRegEquation(x=df.cpu, y=df.mem)
+plt.xlabel('CPU usage (%)')
+plt.ylabel('Peak memory usage (GB)')
+plt.annotate(equation, xy=(0.05, 0.95), xycoords='axes fraction', fontsize=12, color='red')
+plt.savefig(os.path.join(args.figdir,"MemoryVSCpu.png"))
+plt.close()
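
---

Below, a minimal sketch of the input preparation the README hunk above asks for. It assumes only what the patch itself already uses — the `<haplotype name>#<ctg|chr name>` record pattern and bgzip from the PanGeTools image; the `CHM13` name and the paths are placeholders to adapt.

```
# Prefix each record ID with the haplotype name (placeholder: CHM13),
# turning '>chr01' into '>CHM13#chr01'.
sed 's/^>/>CHM13#/' CHM13.fa > data/haplotypes/CHM13.fa

# Compress with bgzip from the PanGeTools image (same invocation as in the
# Snakefile rules); this produces data/haplotypes/CHM13.fa.gz.
apptainer run --app bgzip <your_apptainer_image_directory>/PanGeTools.sif \
    -@ 4 data/haplotypes/CHM13.fa
```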
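Likewise, a hand-run sketch of the reworked `scripts/getPanachePAV.sh` flag interface, mirroring the call made in `rule get_pav`; the chromosome name and thread count are illustrative:

```
# -g chromosome-scale graph, -d pggb working directory for that chromosome,
# -o destination of the Panache-ready PAV matrix, -a directory holding
# PanGeTools.sif, -t threads forwarded to odgi untangle/pav.
bash scripts/getPanachePAV.sh \
    -g data/chrGraphs/chr01.gfa \
    -d data/chrGraphs/chr01 \
    -o output/pav.matrices/chr01.pav.matrix.tsv \
    -a <your_apptainer_image_directory> \
    -t 8
```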