diamond_protein_against_ncbi_nr.Rd
Run protein to protein DIAMOND2 of reference sequences against a blast-able database or fasta file.
diamond_protein_against_ncbi_nr(
query,
ncbi_nr_folder_path_path,
make_diamond_db = FALSE,
store_hit_table_in_tmp = TRUE,
store_hit_table_at_path = tempdir(),
task = "blastp",
sensitivity_mode = "fast",
use_arrow_duckdb_connection = TRUE,
evalue = 1e-20,
out_format = "csv",
cores = 1,
max_target_seqs = "unlimited",
hard_mask = TRUE,
diamond_exec_path = NULL,
add_makedb_options = NULL,
add_diamond_options = NULL
)
path to input file in fasta format.
path to the directory that either stores the raw NCBI NR database
with assumed name nr.gz
(requires make_diamond_db = TRUE
) or to the already formatted nr_diamond.dmnd
or nr
database (assumes default: make_diamond_db = FALSE
).
logical specifying whether or not the NCBI NR database at ncbi_nr_folder_path_path
should be formatted with diamond makedb
(make_diamond_db = FALSE
; default).
protein search task option. Options are:
task = "blastp"
: Standard protein-protein comparisons (default).
specify the level of alignment sensitivity. The higher the sensitivity level, the more deep homologs can be found, but at the cost of reduced computational speed.
sensitivity_mode = "faster"
: fastest alignment mode, but least sensitive. Designed for finding hits of >70% identity.
sensitivity_mode = "default"
: Default mode. Designed for finding hits of >70% identity.
sensitivity_mode = "fast"
: fast alignment mode, but less sensitive than sensitivity_mode = "default" (this is the default setting of this function). Designed for finding hits of >70% identity.
sensitivity_mode = "mid-sensitive"
: fast alignments between the fast
mode and the sensitive mode in sensitivity.
sensitivity_mode = "sensitive"
: fast alignments, but full sensitivity for hits of >40% identity.
sensitivity_mode = "more-sensitive"
: more sensitive than the sensitive
mode.
sensitivity_mode = "very-sensitive"
: sensitive alignment mode.
sensitivity_mode = "ultra-sensitive"
: most sensitive alignment mode (sensitivity as high as BLASTP).
shall the DIAMOND2 hit output table be transformed to an in-process (big data disk-processing) arrow connection to DuckDB? This is useful when the DIAMOND2 output table is too large to fit into memory. Default is use_arrow_duckdb_connection = TRUE
.
Please consult the Installation Vignette for details.
Expectation value (E) threshold for saving hits (default: evalue = 1e-20
).
a character string specifying the format of the file in which the DIAMOND results shall be stored. Available options are:
out_format = "pair"
: Pairwise
out_format = "xml"
: XML
out_format = "csv"
: Comma-separated file
number of cores for parallel DIAMOND searches.
maximum number of aligned sequences that shall be retained. Please be aware that max_target_seqs
selects best hits based on the database entry and not by the best e-value. See details here: https://academic.oup.com/bioinformatics/advance-article/doi/10.1093/bioinformatics/bty833/5106166 .
shall low complexity regions be hard masked with TANTAN? Default is hard_mask = TRUE
.
a path to the DIAMOND executable or conda/miniconda
folder.
a character string specifying additional makedb options that shall be passed on to the diamond makedb command line call, e.g. add_makedb_options = "--taxonnames"
(Default is add_makedb_options = NULL
).
a character string specifying additional diamond options that shall be passed on to the diamond command line call, e.g. add_diamond_options = "--block-size 4.0 --compress 1 --no-self-hits"
(Default is add_diamond_options = NULL
).
if (FALSE) {
  # Example protein FASTA files shipped with the rdiamond package; every
  # example below uses the same query and subject sequences.
  query_fasta <- system.file("seqs/qry_aa.fa", package = "rdiamond")
  subject_fasta <- system.file("seqs/sbj_aa.fa", package = "rdiamond")

  # Example 1 ----
  # DIAMOND executable is found via the system path
  # ('diamond_exec_path = NULL'); search with
  # sensitivity_mode = "ultra-sensitive".
  diamond_example <- diamond_protein_against_ncbi_nr(
    query = query_fasta,
    ncbi_nr_folder_path_path = subject_fasta,
    sensitivity_mode = "ultra-sensitive",
    use_arrow_duckdb_connection = FALSE
  )
  # inspect the DIAMOND hit table
  diamond_example

  # Example 2 ----
  # DIAMOND executable is found via the miniconda path
  # ('diamond_exec_path = "/opt/miniconda3/bin/"'); search on 2 cores with
  # sensitivity_mode = "ultra-sensitive".
  diamond_example_conda <- diamond_protein_against_ncbi_nr(
    query = query_fasta,
    ncbi_nr_folder_path_path = subject_fasta,
    sensitivity_mode = "ultra-sensitive",
    diamond_exec_path = "/opt/miniconda3/bin/",
    use_arrow_duckdb_connection = FALSE,
    cores = 2
  )
  # inspect the DIAMOND hit table
  diamond_example_conda

  # Example 3 ----
  # DIAMOND executable is found via the system path
  # ('diamond_exec_path = NULL'); search with
  # sensitivity_mode = "ultra-sensitive" and pass extra command line options
  # to diamond: "--block-size 4.0 --compress 1 --no-self-hits".
  diamond_example_ultra_sensitive_add_diamond_options <-
    diamond_protein_against_ncbi_nr(
      query = query_fasta,
      ncbi_nr_folder_path_path = subject_fasta,
      sensitivity_mode = "ultra-sensitive",
      max_target_seqs = 500,
      use_arrow_duckdb_connection = FALSE,
      add_diamond_options = "--block-size 4.0 --compress 1 --no-self-hits",
      cores = 1
    )
  # inspect the DIAMOND hit table
  diamond_example_ultra_sensitive_add_diamond_options

  # Example 4 ----
  # DIAMOND executable is found via the system path
  # ('diamond_exec_path = NULL'); search with
  # sensitivity_mode = "ultra-sensitive" and pass extra command line options
  # to diamond makedb: "--taxonnames".
  diamond_example_ultra_sensitive_add_makedb_options <-
    diamond_protein_against_ncbi_nr(
      query = query_fasta,
      ncbi_nr_folder_path_path = subject_fasta,
      sensitivity_mode = "ultra-sensitive",
      max_target_seqs = 500,
      use_arrow_duckdb_connection = FALSE,
      add_makedb_options = "--taxonnames",
      cores = 1
    )
  # inspect the DIAMOND hit table
  diamond_example_ultra_sensitive_add_makedb_options
}