====== BLASR ======
BLASR (Basic Local Alignment with Successive Refinement) rapidly maps reads to genomes by finding the highest scoring local alignment or set of local alignments between the read and the genome.
\\
Optimized for PacBio's extraordinarily long reads and taking advantage of rich quality values, BLASR maps reads rapidly with high accuracy.
* Install
apt-get install libhdf5-serial-dev
wget https://codeload.github.com/PacificBiosciences/blasr/zip/master
mv master blasr-master.zip
unzip blasr-master.zip
mv blasr-master /usr/local/
cd /usr/local/blasr-master/
Edit alignment/Makefile to solve compiling error due to -lpthread flag position:
#
# common.mk contains the configuration for this build setup.
#
#
ifeq ($(origin PREFIX), undefined)
ifeq ($(origin SEYMOUR_HOME), undefined)
PREFIX = /opt/seymour
else
PREFIX = $(SEYMOUR_HOME)
endif
endif
ANALYSIS_HOME=$(PREFIX)/analysis
PBCPP_DIR = ..
include ../common.mk
#
# Define the targets before including the rules since the rules contains a target itself.
#
EXECS = wordCounter printReadWordCount blasr sdpMatcher swMatcher kbandMatcher sawriter saquery samodify printTupleCountTable cmpPrintTupleCountTable malign removeAdapters tabulateAlignment samatcher saprinter buildQualityValueProfile guidedalign extendAlign sals pbmask
# DISABLE for now
#cmpMatcher
all: bin make.dep $(EXECS)
BUILTEXECS = $(addprefix bin/, $(EXECS))
DISTRIB_SET = blasr swMatcher kbandMatcher sawriter samodify printTupleCountTable cmpPrintTupleCountTable removeAdapters sdpMatcher pbmask
DISTRIB_EXECS = $(addprefix bin/, $(DISTRIB_SET))
INSTALL_EXECS = $(addprefix install-, $(DISTRIB_SET))
include ../make.rules
ifneq ($(wildcard make.dep),)
include make.dep
endif
wordCounter: bin/wordCounter
printReadWordCount: bin/printReadWordCount
blasr: bin/blasr
cmpMatcher: bin/cmpMatcher
sdpMatcher: bin/sdpMatcher
samatcher: bin/samatcher
saprinter: bin/saprinter
sals: bin/sals
swMatcher: bin/swMatcher
kbandMatcher: bin/kbandMatcher
sawriter: bin/sawriter
saquery: bin/saquery
samodify: bin/samodify
printTupleCountTable:bin/printTupleCountTable
cmpPrintTupleCountTable:bin/cmpPrintTupleCountTable
malign: bin/malign
removeAdapters: bin/removeAdapters
tabulateAlignment: bin/tabulateAlignment
buildQualityValueProfile: bin/buildQualityValueProfile
guidedalign: bin/guidedalign
extendAlign: bin/extendAlign
pbmask: bin/pbmask
bin/sawriter: bin/SAWriter.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/pbmask: bin/Mask.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
bin/guidedalign: bin/GuidedAlign.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
bin/extendAlign: bin/ExtendAlign.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
bin/saquery: bin/SAQuery.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
bin/samodify: bin/SAModify.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
bin/wordCounter: bin/WordCounter.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
bin/printReadWordCount: bin/PrintReadWordCount.o
$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
#
# Add nonstandard compilation rules to build compressed dna aligners
#
bin/CmpReadMatcher.o: ReadMatcher.cpp
$(CPP) -c $(CPPOPTS) -DCOMPRESSED $< $(STATIC) -o $@
bin/cmpMatcher: bin/CmpReadMatcher.o bin/PositionTable.o
$(CPP) $(CPPOPTS) $< bin/PositionTable.o $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
bin/CmpPrintTupleCountTable.o: PrintTupleCountTable.cpp
$(CPP) -c $(CPPOPTS) -DCOMPRESSED $^ $(STATIC) -o $@
bin/cmpPrintTupleCountTable: bin/CmpPrintTupleCountTable.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
ifneq ($(shell uname -s),Darwin)
LRT = -lrt
endif
bin/blasr: bin/Blasr.o
$(CPP) $(CPPOPTS) $< -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) $(LINK_PROFILER) -lpthread -lz $(LRT) -ldl $(STATIC) -o bin/blasr
bin/samatcher: bin/SAMatcher.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/saprinter: bin/SAPrinter.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/sals: bin/SALS.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/sdpMatcher: bin/SDPMatcher.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/malign: bin/MAlign.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/swMatcher: bin/SWMatcher.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/removeAdapters: bin/RemoveAdapters.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/kbandMatcher: bin/KBandMatcher.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/printTupleCountTable: bin/PrintTupleCountTable.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/tabulateAlignment: bin/TabulateAlignment.o
$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
bin/buildQualityValueProfile: bin/BuildQualityValueProfile.o
$(CPP) -g $(CPPOPTS) $< $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
#
# Set up a default value for the install dir if one does
# not exist.
#
INSTALL_DIR ?= $(ANALYSIS_HOME)/bin
BUILD_DIR ?= bin
install:
/usr/bin/install -d $(INSTALL_DIR)
/usr/bin/install -m 555 $(DISTRIB_EXECS) $(INSTALL_DIR)
install-%:
/usr/bin/install -d $(INSTALL_DIR)
/usr/bin/install -m 555 $(DISTRIB_EXECS) $(INSTALL_DIR)
make
ln -s /usr/local/blasr-master/alignment/bin/blasr /usr/local/bin/
chown -R root:root /usr/local/blasr-master
* Test
blasr -h
Options for blasr
Basic usage: 'blasr reads.{fasta,bas.h5} genome.fasta [-options]
option Description (default_value).
Input Files.
reads.fasta is a multi-fasta file of reads. While any fasta file is valid input,
it is preferable to use pls.h5 or bas.h5 files because they contain
more rich quality value information.
reads.bas.h5|reads.pls.h5 Is the native output format in Hierarchical Data Format of
SMRT reads. This is the preferred input to blasr because rich quality
value (insertion,deletion, and substitution quality values) information is
maintained. The extra quality information improves variant detection and mapping
speed.
-sa suffixArrayFile
Use the suffix array 'sa' for detecting matches
between the reads and the reference. The suffix
array has been prepared by the sawriter program.
-ctab tab
A table of tuple counts used to estimate match significance. This is
by the program 'printTupleCountTable'. While it is quick to generate on
the fly, if there are many invocations of blasr, it is useful to
precompute the ctab.
-regionTable table
Read in a read-region table in HDF format for masking portions of reads.
This may be a single table if there is just one input file.
or a fofn. When a region table is specified, any region table inside
the reads.bas.h5 or reads.bas.h5 files are ignored.
Options for modifying reads. There is ancilliary information about substrings of reads
that is stored in a 'region table' for each read file. Because
HDF is used, the region table may be part of the .bas.h5 or .pls.h5 file,
or a separate file. A contiguously read substring from the template
is a subread, and any read may contain multiple subreads. The boundaries
of the subreads may be inferred from the region table either directly or
by definition of adapter boundaries. Typically region tables also
contain information for the location of the high and low quality regions of
reads. Reads produced by spurrious reads from empty ZMWs have a high
quality start coordinate equal to high quality end, making no usable read.
-useccs
Align the circular consensus sequence (ccs), then report alignments
of the ccs subreads to the window that the ccs was mapped to. Only
alignments of the subreads are reported.
-useccsall
Similar to -useccs, except all subreads are aligned, rather than just
the subreads used to call the ccs. This will include reads that only
cover part of the template.
-useccsdenovo
Align the circular consensus, and report only the alignment of the ccs
sequence.
-noSplitSubreads (false)
Do not split subreads at adapters. This is typically only
useful when the genome in an unrolled version of a known template, and
contains template-adapter-reverse_template sequence.
-ignoreRegions(false)
Ignore any information in the region table.
-ignoreHQRegions (false)Ignore any hq regions in the region table.
Alignment Output.
-bestn n (10)
Report the top 'n' alignments.
-sam Write output in SAM format.
-clipping [none|hard|soft] (none)
Use no/hard/soft clipping for SAM output.
-out out (terminal)
Write output to 'out'.
-unaligned file
Output reads that are not aligned to 'file'
-m t
If not printing SAM, modify the output of the alignment.
t=0 Print blast like output with |'s connecting matched nucleotides.
1 Print only a summary: score and pos.
2 Print in Compare.xml format.
3 Print in vulgar format (deprecated).
4 Print a longer tabular version of the alignment.
5 Print in a machine-parsable format that is read by compareSequences.py.
-noSortRefinedAlignments (false)
Once candidate alignments are generated and scored via sparse dynamic
programming, they are rescored using local alignment that accounts
for different error profiles.
Resorting based on the local alignment may change the order the hits are returned.
-allowAdjacentIndels
When specified, adjacent insertion or deletions are allowed. Otherwise, adjacent
insertion and deletions are merged into one operation. Using quality values
to guide pairwise alignments may dictate that the higher probability alignment
contains adjacent insertions or deletions. Current tools such as GATK do not permit
this and so they are not reported by default.
-header
Print a header as the first line of the output file describing the contents of each column.
-titleTable tab (NULL)
Construct a table of reference sequence titles. The reference sequences are
enumerated by row, 0,1,... The reference index is printed in alignment results
rather than the full reference name. This makes output concise, particularly when
very verbose titles exist in reference names.
-minPctIdentity p (0)
Only report alignments if they are greater than p percent identity.
-unaligned file
Output reads that are not aligned to 'file'.
-holeNumbers LIST
When specified, only align reads whose ZMW hole numbers are in LIST.
LIST is a comma-delimited string of ranges, such as '1,2,3,10-13'.
This option only works when reads are in base or pulse h5 format.
-placeRepeatsRandomly (false)
When there are multiple positions to map a read with equal alignment scores, place the
read randomly at one of them.
Options for anchoring alignment regions. This will have the greatest effect on speed and sensitivity.
-minMatch m (10)
Minimum seed length. Higher minMatch will speed up alignment,
but decrease sensitivity.
-maxMatch l (inf)
Stop mapping a read to the genome when the lcp length reaches l.
This is useful when the query is part of the reference, for example when
constructing pairwise alignments for de novo assembly.
-maxLCPLength l (inf)
The same as -maxMatch.
-maxAnchorsPerPosition m (inf)
Do not add anchors from a position if it matches to more than 'm' locations in the target.
-advanceExactMatches E (0)
Another trick for speeding up alignments with match - E fewer anchors. Rather than
finding anchors between the read and the genome at every position in the read,
when an anchor is found at position i in a read of length L, the next position
in a read to find an anchor is at i+L-E.
Use this when alignining already assembled contigs.
-nCandidates n (10)
Keep up to 'n' candidates for the best alignment. A large value of n will slow mapping
because the slower dynamic programming steps are applied to more clusters of anchors
which can be a rate limiting step when reads are very long.
-mapSubreadsOfZmwTogether(false)
Map all subreads of a zmw (hole) to where the longest subread of the zmw
aligned to. This requires to use the region table and hq regions.
This option only works when reads are in base or pulse h5 format.
Options for Refining Hits.
-sdpTupleSize K (11)
Use matches of length K to speed dynamic programming alignments. This controls
accuracy of assigning gaps in pairwise alignments once a mapping has been found,
rather than mapping sensitivity itself.
-scoreMatrix "score matrix string"
Specify an alternative score matrix for scoring fasta reads. The matrix is
in the format
ACGTN
A abcde
C fghij
G klmno
T pqrst
N uvwxy . The values a...y should be input as a quoted space separated
string: "a b c ... y". Lower scores are better, so matches should be less
than mismatches e.g. a,g,m,s = -5 (match), mismatch = 6.
-affineExtend a (5)
Change affine (extension) gap penalty. Lower value allows more gaps.
Options for overlap/dynamic programming alignments and pairwise overlap for de novo assembly.
-useQuality (false)
Use substitution/insertion/deletion/merge quality values to score gap and
mismatch penalties in pairwise alignments. Because the insertion and deletion
rates are much higher than substitution, this will make many alignments
favor an insertion/deletion over a substitution. Naive consensus calling methods
will then often miss substitution polymorphisms. This option should be
used when calling consensus using the Quiver method. Furthermore, when
not using quality values to score alignments, there will be a lower consensus
accuracy in homolymer regions.
-affineAlign (false)
Refine alignment using affine guided align.
Options for filtering reads.
-minReadLength l(50)
Skip reads that have a full length less than l. Subreads may be shorter.
-minSubreadLength l(0)
Do not align subreads of length less than l.
-maxScore m (0)
Maximum score to output (high is bad, negative good).
Options for parallel alignment.
-nproc N (1)
Align using N processes. All large data structures such as the suffix array and
tuple count table are shared.
-start S (0)
Index of the first read to begin aligning. This is useful when multiple instances
are running on the same data, for example when on a multi-rack cluster.
-stride S (1)
Align one read every 'S' reads.
Options for subsampling reads.
-subsample (0)
Proportion of reads to randomly subsample (expressed as a decimal) and align.
-v Print some verbose information.
-V 2 Make verbosity more verbose. Probably only useful for development.
-h Print this help file.
To cite BLASR, please use: Chaisson M.J., and Tesler G., Mapping
single molecule sequencing reads using Basic Local Alignment with
Successive Refinement (BLASR): Theory and Application, BMC
Bioinformatics 2012, 13:238 .