diff --git a/all-bis.snakefile b/all-bis.snakefile index 183f3d58bb98be297d701af471152b53d7fc74a6..9feac62ba7fede70e2856b8651955e648df4f658 100644 --- a/all-bis.snakefile +++ b/all-bis.snakefile @@ -10,8 +10,8 @@ rule all: dsmz='corpora/dsmz/test-3.3.txt', microbes_habitat='ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt', microbes_phenotype='ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt', - index_folder='corpora/microbes-2019/index', - expander_folder='corpora/microbes-2019/expander' + index_folder='corpora/pubmed/index', + expander_folder='corpora/pubmed/expander' #onto_habitat_json='ancillaries/BioNLP-OST+EnovFood-Habitat.json', #onto_phenotype_json='ancillaries/BioNLP-OST+EnovFood-Phenotype.json', #onto_use_json='ancillaries/Use_V2.json' @@ -130,7 +130,7 @@ rule process_dsmz_corpus: """ ## document batches -BATCHES, = glob_wildcards("corpora/microbes-2019/batch/{id}/batch.xml") +BATCHES, = glob_wildcards("corpora/pubmed/batches/{id}/batch.xml") rule process_pubmed_corpus: input: @@ -140,8 +140,8 @@ rule process_pubmed_corpus: phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths', use_paths='ancillaries/Use_V2.paths' output: - expander_folder=directory("corpora/microbes-2019/expander"), - index_folder=directory("corpora/microbes-2019/index"), + expander_folder=directory("corpora/pubmed/expander"), + index_folder=directory("corpora/pubmed/index"), florilege_Habitat_result="ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt", florilege_Phenotype_result="ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt" shell: """snakemake --verbose \ diff --git a/all.snakefile b/all.snakefile index 531f768256094ca2106760996d8119502bb983f9..2a995087404272ebe9c20573b31d04a15831bf0e 100644 --- a/all.snakefile +++ b/all.snakefile @@ -10,8 +10,8 @@ rule all: dsmz='corpora/dsmz/test-3.3.txt', microbes_habitat='ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt', 
microbes_phenotype='ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt', - index_folder='corpora/microbes-2019/index', - expander_folder='corpora/microbes-2019/expander' + index_folder='corpora/pubmed/index', + expander_folder='corpora/pubmed/expander' #onto_habitat_json='ancillaries/BioNLP-OST+EnovFood-Habitat.json', #onto_phenotype_json='ancillaries/BioNLP-OST+EnovFood-Phenotype.json', #onto_use_json='ancillaries/Use_V2.json' @@ -140,7 +140,7 @@ rule process_dsmz_corpus: """ ## document batches -BATCHES, = glob_wildcards("corpora/microbes-2019/batch/{id}/batch.xml") +BATCHES, = glob_wildcards("corpora/pubmed/batches/{id}/batch.xml") rule process_pubmed_corpus: input: @@ -150,8 +150,8 @@ rule process_pubmed_corpus: phenotype_paths='ancillaries/BioNLP-OST+EnovFood-Phenotype.paths', use_paths='ancillaries/Use_V2.paths' output: - expander_folder=directory("corpora/microbes-2019/expander"), - index_folder=directory("corpora/microbes-2019/index"), + expander_folder=directory("corpora/pubmed/expander"), + index_folder=directory("corpora/pubmed/index"), florilege_Habitat_result="ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt", florilege_Phenotype_result="ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt" shell: """snakemake --verbose \ diff --git a/config/config.yaml b/config/config.yaml index 9b2c74a186d721fd0ab904669dcbaa3ed420b029..8605298ff260ff73fa036c57ab1688cf4e8406c0 100644 --- a/config/config.yaml +++ b/config/config.yaml @@ -6,6 +6,12 @@ USE: "ancillaries/Use_V2.obo" ONTONAMES: "BioNLP-OST+EnovFood-Habitat BioNLP-OST+EnovFood-Phenotype Use_V2" +NCBI_TAXO_ID: "ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt" + +NCBI_TAXO_MICROORGANISMS: "ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt" + +MAP_BACDIVE_TAXID: "ancillaries/ncbi-taxonomy-prefix/dsmz-match/bacdive-to-taxid.txt" + ## cirm CIRM_FOLDER_NAME : "cirm" CIRM_CORPUS_HOME : "corpora/cirm/" @@ -32,9 +38,9 @@ DSMZ_HABITAT_CORPUS : 
"corpora/dsmz/dsmz-data/category=origin-key=sample_type.ts DSMZ_RESULT : "corpora/dsmz/test-3.3.txt" ## pubmed -PUBMED_FOLDER_NAME : "microbes-2019" -PUBMED_CORPUS_HOME : "corpora/microbes-2019" -PUBMED_BATCHES_HOME : "corpora/microbes-2019/batch" +PUBMED_FOLDER_NAME : "pubmed" +PUBMED_CORPUS_HOME : "corpora/pubmed" +PUBMED_BATCHES_HOME : "corpora/pubmed/batches" PUBMED_HABITAT_RESULT : "ancillaries/Florilege/2019-12-12/PubMed-Habitat-2019-12-12.txt" PUBMED_PHENOTYPE_RESULT : "ancillaries/Florilege/2019-12-12/PubMed-Phenotype-2019-12-12.txt" @@ -43,8 +49,8 @@ BIONLPOST_BATCHES_HOME : "corpora/BioNLP-OST-2019/batch" BIONLPOST_API : "http://bibliome.jouy.inra.fr/demo/BioNLP-OST-2019-Evaluation/api" ## index -ALVISIR_INDEX : "corpora/microbes-2019/index" -ALVISIR_EXPANDER : "corpora/microbes-2019/expander" +ALVISIR_INDEX : "corpora/pubmed/index" +ALVISIR_EXPANDER : "corpora/pubmed/expander" ALVISIR_EXPANDER_CONF : "ancillaries/expander.xml" diff --git a/docs/1-preprocess-ontology.md b/docs/1-preprocess-ontology.md index 1b40575d06b5879ee6d334c79680f266df9c69ae..34d08c58ad2e2a22dd1c38894f77674397657594 100644 --- a/docs/1-preprocess-ontology.md +++ b/docs/1-preprocess-ontology.md @@ -56,6 +56,8 @@ The pipeline handles the following resources : * `alvisnlp singularity container` * `conda env` + +<!--> |solution |nb steps | |--------|--------| |AlvisNLP plans | 4 | diff --git a/docs/2-generate-concept-path.md b/docs/2-generate-concept-path.md index 4ccc5dcc731d53da6bac751dc9d5ce41968d207d..1481d3a9764fcec6c6d31e44df9c44554e6aa27a 100644 --- a/docs/2-generate-concept-path.md +++ b/docs/2-generate-concept-path.md @@ -50,6 +50,7 @@ The pipeline handles the following resources : * programs * `alvisnlp singularity container` +<!--> |solution |nb steps | |--------|--------| diff --git a/docs/5-process-dsmz-data.md b/docs/5-process-dsmz-data.md index 012e1285e217be448a1b71962b08ff02fa9127d3..3f87a73a8476e0998cf3345dc599048b2a3cb24f 100644 --- a/docs/5-process-dsmz-data.md +++ 
b/docs/5-process-dsmz-data.md @@ -55,6 +55,7 @@ The pipeline handles the following resources: * `alvisnlp singularity container` * `python env` +<!--> |solution |nb steps | |--------|--------| |AlvisNLP plans | 2 | diff --git a/docs/6-process-pubmed-data.md b/docs/6-process-pubmed-data.md index 7271d1d36b99994005b26332de58f542b356b499..73fbe848be4d8c235cf274dcfd612900d1262990 100644 --- a/docs/6-process-pubmed-data.md +++ b/docs/6-process-pubmed-data.md @@ -70,6 +70,7 @@ The pipeline handles the following resources : * `python env` +<!-- > |solution |nb steps | |--------|--------| |AlvisNLP plans | 1 | diff --git a/docs/7-process-bionlp-ost.md b/docs/7-process-bionlp-ost.md index f2736d4a8d1b09e4011fbcf8a731e530f57ebe75..edee115173e327e2614aaa38ae64945188069921 100644 --- a/docs/7-process-bionlp-ost.md +++ b/docs/7-process-bionlp-ost.md @@ -7,7 +7,7 @@ The three tasks are: * `BB-rel+ner`: evaluates NER and relation extraction (Lives_In and Exhibits). * `BB-kb+ner`: evaluates knowledge base extraction performance. -The three datasets are available in `corpora/BioNLP-OST-2019/batch`. +The three datasets are available in `corpora/BioNLP-OST-2019/batches`. 
## Run the pipeline @@ -66,7 +66,7 @@ The pipeline handles the following resources : * `alvisnlp singularity container` * `python env` - +<!--> |solution |nb steps | |--------|--------| |AlvisNLP plans | xxx | diff --git a/plans/entities.plan b/plans/entities.plan index 9eef3d8facdf29dd45d55a09ce36dd47a105ebf9..071c3c9b5ea27f1ea2e80ba5daff61c16a1ff597 100644 --- a/plans/entities.plan +++ b/plans/entities.plan @@ -305,7 +305,7 @@ <target>documents.sections.layer:taxa</target> <form>@taxid</form> <!--<operator>prefix</operator>--> - <mappingFile>ancillaries/ncbi-taxonomy/taxid_microorganisms.txt</mappingFile> + <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt</mappingFile> <!--<mappingFile>ancillaries/microorganisms.txt</mappingFile>--> <targetFeatures>microorganism</targetFeatures> </taxids> diff --git a/plans/map_microorganisms.plan b/plans/map_microorganisms.plan index db81d24bdf30a6a381d2fdc84ea38a4b94631242..3eb303ec0ae621a66d74ab0075ff9c3cd52b6570 100644 --- a/plans/map_microorganisms.plan +++ b/plans/map_microorganisms.plan @@ -39,7 +39,7 @@ <target>documents.sections.layer:taxa</target> <form>@taxid</form> <!--<operator>prefix</operator>--> - <mappingFile>ancillaries/ncbi-taxonomy/taxid_microorganisms.txt</mappingFile> + <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt</mappingFile> <!--<mappingFile>ancillaries/microorganisms.txt</mappingFile>--> <targetFeatures>microorganism</targetFeatures> </taxids> diff --git a/plans/map_taxid.plan b/plans/map_taxid.plan index e2a9c1991dff4dd2664b2f9755c5b3b16f897dc7..6522b46d4a8eaeeefbc6457cddcf82383f7069d8 100644 --- a/plans/map_taxid.plan +++ b/plans/map_taxid.plan @@ -21,7 +21,7 @@ <taxids class="FileMapper"> <target>documents.sections</target> <form>"ncbi:" ^ str:normalizeSpace(contents)</form> - <mappingFile>ancillaries/ncbi-taxonomy/taxa+id_full.txt</mappingFile> + <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt</mappingFile> <keyColumn>1</keyColumn> 
<targetFeatures>,taxid,canonical-name,path,pos,rank</targetFeatures> </taxids> diff --git a/plans/taxa.plan b/plans/taxa.plan index d724fb2079f0a3231dbe5608487b530c8d02da97..91feb3d75181cbac8a61d1fecb763c4674d3ea88 100644 --- a/plans/taxa.plan +++ b/plans/taxa.plan @@ -52,7 +52,7 @@ <target>documents.sections.layer:taxa</target> <form>@taxid</form> <!--<operator>prefix</operator>--> - <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt</mappingFile> + <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt</mappingFile> <!--<keyColumn>2</keyColumn>--> <targetFeatures>microorganism</targetFeatures> </taxids> diff --git a/plans/taxa_generic.plan b/plans/taxa_generic.plan index 7e49aea9699f532bc6d7167fc35735d0c907e0f7..7b02dd694722f2fa57d07e60c032a3aef90e5e5d 100644 --- a/plans/taxa_generic.plan +++ b/plans/taxa_generic.plan @@ -1,7 +1,7 @@ <alvisnlp-plan id="taxa"> <dict class="TabularProjector"> <targetLayerName>taxa</targetLayerName> - <dictFile>ancillaries/ncbi-taxonomy/taxa+id_full.txt</dictFile> + <dictFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt</dictFile> <matchStartCaseInsensitive/> <valueFeatures>,taxid,canonical-name,path,pos,rank,species-taxid,species-name</valueFeatures> <constantAnnotationFeatures>source=NCBI</constantAnnotationFeatures> @@ -48,7 +48,7 @@ <target>documents.sections.layer:taxa</target> <form>@taxid</form> <!--<operator>prefix</operator>--> - <mappingFile>ancillaries/ncbi-taxonomy/taxid_microorganisms.txt</mappingFile> + <mappingFile>ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt</mappingFile> <!--<keyColumn>2</keyColumn>--> <targetFeatures>microorganism</targetFeatures> </taxids> diff --git a/process-evaluate_BioNLP-OST.snakefile b/process-evaluate_BioNLP-OST.snakefile index d3dca3e62d5c7d652dc4d19d94025d5b4aee1afc..e12776e94868962a03ba296361b654f9d3de26b9 100644 --- a/process-evaluate_BioNLP-OST.snakefile +++ b/process-evaluate_BioNLP-OST.snakefile @@ -44,7 +44,7 @@ rule 
run_bionlp_prediction: ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', dir='corpora/BioNLP-OST-2019/batches/{B}/', - taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] shell:""" diff --git a/process_CIRM_corpus.snakefile b/process_CIRM_corpus.snakefile index 3da26f833a9c8b6513c2dc0191ea88643b934f56..8faebec2a8eda713b5b5dfbbd3d39a1b949f21ba 100644 --- a/process_CIRM_corpus.snakefile +++ b/process_CIRM_corpus.snakefile @@ -80,7 +80,7 @@ rule map_cirm_bia_microorganisms: mapped_taxaids='corpora/cirm/mapped_bia_taxa.txt' params: plan='plans/map_microorganisms.plan', - taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ @@ -101,7 +101,7 @@ rule map_cirm_yeast_microorganisms: mapped_taxaids='corpora/cirm/mapped_yeast_taxa.txt' params: plan='plans/map_microorganisms.plan', - taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ @@ -123,7 +123,7 @@ rule map_cirm_cfbp_microorganisms: mapped_taxa='corpora/cirm/mapped_cfbp_taxa.txt' params: plan='plans/map_microorganisms.plan', - taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' 
singularity:config["SINGULARITY_IMG"] shell: """alvisnlp -J-Xmx32g -cleanTmp -verbose \ diff --git a/process_PubMed_corpus.snakefile b/process_PubMed_corpus.snakefile index 16c8bef8980c9fcdcf74a1ce5f7e9667582adcbf..4d9455d334140875574ab2c52e54024bbf21d533 100644 --- a/process_PubMed_corpus.snakefile +++ b/process_PubMed_corpus.snakefile @@ -60,7 +60,7 @@ rule run_pubmed_entities: ontobiotopeUse='ancillaries/Use_V2', plan='plans/entities.plan', dir='corpora/pubmed/batches/{B}/', - taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt', + taxid_microorganisms='ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt', taxa_id_full='ancillaries/ncbi-taxonomy-prefix/taxa+id_full.txt' singularity:config["SINGULARITY_IMG"] shell: @@ -132,7 +132,7 @@ create the expander rule create_pubmed_expander: input: expander="ancillaries/expander.xml", - taxa_id_microorganisms="ancillaries/ncbi-taxonomy-prefix/taxid_microorganisms.txt", + taxa_id_microorganisms="ancillaries/ncbi-taxonomy-prefix/taxa+id_microorganisms.txt", onto_habitat="ancillaries/BioNLP-OST+EnovFood-Habitat.obo", onto_phenotype="ancillaries/BioNLP-OST+EnovFood-Phenotype.obo", onto_use="ancillaries/Use_V2.obo"