BLASR

BLASR (Basic Local Alignment with Successive Refinement) rapidly maps reads to genomes by finding the highest scoring local alignment or set of local alignments between the read and the genome.
Optimized for PacBio's extraordinarily long reads and taking advantage of rich quality values, BLASR maps reads rapidly with high accuracy.

  • Install
apt-get install libhdf5-serial-dev

wget https://codeload.github.com/PacificBiosciences/blasr/zip/master
mv master blasr-master.zip
unzip blasr-master.zip
mv blasr-master /usr/local/
cd /usr/local/blasr-master/

Edit alignment/Makefile to solve compiling error due to -lpthread flag position:

Makefile
#
# common.mk contains the configuration for this build setup.
#
#
ifeq ($(origin PREFIX), undefined)
  ifeq ($(origin SEYMOUR_HOME), undefined)
    PREFIX = /opt/seymour
  else
    PREFIX = $(SEYMOUR_HOME)
  endif
endif
ANALYSIS_HOME=$(PREFIX)/analysis
 
PBCPP_DIR = ..
 
include ../common.mk
 
#
# Define the targets before including the rules since the rules contains a target itself.
#
 
EXECS = wordCounter printReadWordCount blasr sdpMatcher swMatcher kbandMatcher sawriter saquery samodify printTupleCountTable cmpPrintTupleCountTable malign removeAdapters tabulateAlignment samatcher saprinter buildQualityValueProfile guidedalign extendAlign sals pbmask
 
# DISABLE for now
#cmpMatcher
 
 
all: bin make.dep $(EXECS)
 
BUILTEXECS = $(addprefix bin/, $(EXECS))
DISTRIB_SET = blasr swMatcher kbandMatcher sawriter samodify printTupleCountTable cmpPrintTupleCountTable removeAdapters sdpMatcher pbmask
DISTRIB_EXECS = $(addprefix bin/, $(DISTRIB_SET))
INSTALL_EXECS = $(addprefix install-, $(DISTRIB_SET))
 
 
include ../make.rules
 
ifneq ($(wildcard make.dep),)
    include make.dep
endif
 
wordCounter:        bin/wordCounter
printReadWordCount: bin/printReadWordCount
blasr:        bin/blasr
cmpMatcher:         bin/cmpMatcher
sdpMatcher:         bin/sdpMatcher
samatcher:          bin/samatcher
saprinter:          bin/saprinter
sals:               bin/sals
swMatcher:          bin/swMatcher
kbandMatcher:       bin/kbandMatcher
sawriter:           bin/sawriter
saquery:            bin/saquery
samodify:           bin/samodify
printTupleCountTable:bin/printTupleCountTable
cmpPrintTupleCountTable:bin/cmpPrintTupleCountTable
malign:	            bin/malign
removeAdapters:     bin/removeAdapters
tabulateAlignment:  bin/tabulateAlignment
buildQualityValueProfile: bin/buildQualityValueProfile
guidedalign: bin/guidedalign
extendAlign: bin/extendAlign
pbmask: bin/pbmask
 
bin/sawriter: bin/SAWriter.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/pbmask: bin/Mask.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
 
bin/guidedalign: bin/GuidedAlign.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
 
bin/extendAlign: bin/ExtendAlign.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
 
bin/saquery: bin/SAQuery.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
 
bin/samodify: bin/SAModify.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
 
bin/wordCounter: bin/WordCounter.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
 
bin/printReadWordCount: bin/PrintReadWordCount.o
	$(CPP) $(CPPOPTS) $^ $(STATIC) -o $@
 
#
# Add nonstandard compilation rules to build compressed dna aligners
#
 
bin/CmpReadMatcher.o: ReadMatcher.cpp
	$(CPP) -c $(CPPOPTS) -DCOMPRESSED $< $(STATIC) -o $@
 
bin/cmpMatcher: bin/CmpReadMatcher.o bin/PositionTable.o
	$(CPP) $(CPPOPTS) $< bin/PositionTable.o $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
 
bin/CmpPrintTupleCountTable.o: PrintTupleCountTable.cpp
	$(CPP) -c $(CPPOPTS) -DCOMPRESSED $^ $(STATIC) -o $@
 
bin/cmpPrintTupleCountTable: bin/CmpPrintTupleCountTable.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
ifneq ($(shell uname -s),Darwin)
    LRT = -lrt
endif
bin/blasr: bin/Blasr.o
	$(CPP) $(CPPOPTS) $< -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) $(LINK_PROFILER) -lpthread -lz $(LRT) -ldl $(STATIC) -o bin/blasr
 
bin/samatcher: bin/SAMatcher.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/saprinter: bin/SAPrinter.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/sals: bin/SALS.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/sdpMatcher: bin/SDPMatcher.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/malign: bin/MAlign.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/swMatcher: bin/SWMatcher.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/removeAdapters: bin/RemoveAdapters.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/kbandMatcher: bin/KBandMatcher.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/printTupleCountTable: bin/PrintTupleCountTable.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/tabulateAlignment: bin/TabulateAlignment.o
	$(CPP) $(CPPOPTS) $< $(STATIC) -o $@
 
bin/buildQualityValueProfile: bin/BuildQualityValueProfile.o
	$(CPP) -g $(CPPOPTS) $< $(STATIC) -o $@ -L$(HDF5LIBDIR) -l$(HDF5LIBCPP) -l$(HDF5LIB) -lz -lpthread
 
#
# Set up a default value for the install dir if one does
# not exist.
#
INSTALL_DIR ?= $(ANALYSIS_HOME)/bin
BUILD_DIR ?= bin
install:
	/usr/bin/install -d $(INSTALL_DIR)
	/usr/bin/install -m 555 $(DISTRIB_EXECS) $(INSTALL_DIR)
 
install-%:
	/usr/bin/install -d $(INSTALL_DIR)
	/usr/bin/install -m 555 $(DISTRIB_EXECS) $(INSTALL_DIR)
make
ln -s /usr/local/blasr-master/alignment/bin/blasr /usr/local/bin/
chown -R root:root /usr/local/blasr-master
  • Test
blasr -h
   Options for blasr
   Basic usage: 'blasr reads.{fasta,bas.h5} genome.fasta [-options]
 option Description (default_value).

 Input Files.
   reads.fasta is a multi-fasta file of reads.  While any fasta file is valid input,
               it is preferable to use pls.h5 or bas.h5 files because they contain
               more rich quality value information.

   reads.bas.h5|reads.pls.h5 Is the native output format in Hierarchical Data Format of
               SMRT reads. This is the preferred input to blasr because rich quality
               value (insertion,deletion, and substitution quality values) information is
               maintained.  The extra quality information improves variant detection and mapping
               speed.

   -sa suffixArrayFile
               Use the suffix array 'sa' for detecting matches
               between the reads and the reference.  The suffix
               array has been prepared by the sawriter program.

   -ctab tab
               A table of tuple counts used to estimate match significance.  This is
               by the program 'printTupleCountTable'.  While it is quick to generate on
               the fly, if there are many invocations of blasr, it is useful to
               precompute the ctab.

   -regionTable table
               Read in a read-region table in HDF format for masking portions of reads.
               This may be a single table if there is just one input file.
               or a fofn.  When a region table is specified, any region table inside
               the reads.bas.h5 or reads.bas.h5 files are ignored.

 Options for modifying reads. There is ancilliary information about substrings of reads
               that is stored in a 'region table' for each read file.  Because
               HDF is used, the region table may be part of the .bas.h5 or .pls.h5 file,
               or a separate file.  A contiguously read substring from the template
               is a subread, and any read may contain multiple subreads. The boundaries
               of the subreads may be inferred from the region table either directly or
               by definition of adapter boundaries.  Typically region tables also
               contain information for the location of the high and low quality regions of
               reads.  Reads produced by spurrious reads from empty ZMWs have a high
               quality start coordinate equal to high quality end, making no usable read.
   -useccs
               Align the circular consensus sequence (ccs), then report alignments
               of the ccs subreads to the window that the ccs was mapped to.  Only
               alignments of the subreads are reported.
   -useccsall
               Similar to -useccs, except all subreads are aligned, rather than just
               the subreads used to call the ccs.  This will include reads that only
               cover part of the template.
   -useccsdenovo
               Align the circular consensus, and report only the alignment of the ccs
               sequence.
   -noSplitSubreads (false)
               Do not split subreads at adapters.  This is typically only
               useful when the genome in an unrolled version of a known template, and
               contains template-adapter-reverse_template sequence.
   -ignoreRegions(false)
               Ignore any information in the region table.
   -ignoreHQRegions (false)Ignore any hq regions in the region table.

 Alignment Output.
   -bestn n (10)
               Report the top 'n' alignments.
   -sam        Write output in SAM format.
   -clipping [none|hard|soft] (none)
               Use no/hard/soft clipping for SAM output.
   -out out (terminal)
               Write output to 'out'.
   -unaligned file
               Output reads that are not aligned to 'file'
   -m t
               If not printing SAM, modify the output of the alignment.
                t=0 Print blast like output with |'s connecting matched nucleotides.
                  1 Print only a summary: score and pos.
                  2 Print in Compare.xml format.
                  3 Print in vulgar format (deprecated).
                  4 Print a longer tabular version of the alignment.
                  5 Print in a machine-parsable format that is read by compareSequences.py.
   -noSortRefinedAlignments (false)
               Once candidate alignments are generated and scored via sparse dynamic
               programming, they are rescored using local alignment that accounts
               for different error profiles.
               Resorting based on the local alignment may change the order the hits are returned.
   -allowAdjacentIndels
               When specified, adjacent insertion or deletions are allowed. Otherwise, adjacent
               insertion and deletions are merged into one operation.  Using quality values
               to guide pairwise alignments may dictate that the higher probability alignment
               contains adjacent insertions or deletions.  Current tools such as GATK do not permit
               this and so they are not reported by default.
   -header
               Print a header as the first line of the output file describing the contents of each column.
   -titleTable tab (NULL)
               Construct a table of reference sequence titles.  The reference sequences are
               enumerated by row, 0,1,...  The reference index is printed in alignment results
               rather than the full reference name.  This makes output concise, particularly when
               very verbose titles exist in reference names.
   -minPctIdentity p (0)
               Only report alignments if they are greater than p percent identity.
   -unaligned file
               Output reads that are not aligned to 'file'.
   -holeNumbers LIST
               When specified, only align reads whose ZMW hole numbers are in LIST.
               LIST is a comma-delimited string of ranges, such as '1,2,3,10-13'.
               This option only works when reads are in base or pulse h5 format.
   -placeRepeatsRandomly (false)
               When there are multiple positions to map a read with equal alignment scores, place the
               read randomly at one of them.

 Options for anchoring alignment regions. This will have the greatest effect on speed and sensitivity.
   -minMatch m (10)
               Minimum seed length.  Higher minMatch will speed up alignment,
               but decrease sensitivity.
   -maxMatch l (inf)
               Stop mapping a read to the genome when the lcp length reaches l.
               This is useful when the query is part of the reference, for example when
               constructing pairwise alignments for de novo assembly.
   -maxLCPLength l (inf)
               The same as -maxMatch.
   -maxAnchorsPerPosition m (inf)
               Do not add anchors from a position if it matches to more than 'm' locations in the target.
   -advanceExactMatches E (0)
               Another trick for speeding up alignments with match - E fewer anchors.  Rather than
               finding anchors between the read and the genome at every position in the read,
               when an anchor is found at position i in a read of length L, the next position
               in a read to find an anchor is at i+L-E.
               Use this when alignining already assembled contigs.
   -nCandidates n (10)
               Keep up to 'n' candidates for the best alignment.  A large value of n will slow mapping
               because the slower dynamic programming steps are applied to more clusters of anchors
               which can be a rate limiting step when reads are very long.
   -mapSubreadsOfZmwTogether(false)
               Map all subreads of a zmw (hole) to where the longest subread of the zmw
               aligned to. This requires to use the region table and hq regions.
               This option only works when reads are in base or pulse h5 format.

  Options for Refining Hits.
   -sdpTupleSize K (11)
               Use matches of length K to speed dynamic programming alignments.  This controls
               accuracy of assigning gaps in pairwise alignments once a mapping has been found,
               rather than mapping sensitivity itself.
   -scoreMatrix "score matrix string"
               Specify an alternative score matrix for scoring fasta reads.  The matrix is
               in the format
                  ACGTN
                A abcde
                C fghij
                G klmno
                T pqrst
                N uvwxy . The values a...y should be input as a quoted space separated
               string: "a b c ... y". Lower scores are better, so matches should be less
               than mismatches e.g. a,g,m,s = -5 (match), mismatch = 6.
   -affineExtend a (5)
               Change affine (extension) gap penalty. Lower value allows more gaps.

 Options for overlap/dynamic programming alignments and pairwise overlap for de novo assembly.
   -useQuality (false)
               Use substitution/insertion/deletion/merge quality values to score gap and
               mismatch penalties in pairwise alignments.  Because the insertion and deletion
               rates are much higher than substitution, this will make many alignments
               favor an insertion/deletion over a substitution.  Naive consensus calling methods
               will then often miss substitution polymorphisms. This option should be
               used when calling consensus using the Quiver method.  Furthermore, when
               not using quality values to score alignments, there will be a lower consensus
               accuracy in homolymer regions.
   -affineAlign (false)
               Refine alignment using affine guided align.

 Options for filtering reads.
   -minReadLength l(50)
               Skip reads that have a full length less than l. Subreads may be shorter.
   -minSubreadLength l(0)
               Do not align subreads of length less than l.
   -maxScore m (0)
               Maximum score to output (high is bad, negative good).

 Options for parallel alignment.
   -nproc N (1)
               Align using N processes.  All large data structures such as the suffix array and
               tuple count table are shared.
   -start S (0)
               Index of the first read to begin aligning. This is useful when multiple instances
               are running on the same data, for example when on a multi-rack cluster.
   -stride S (1)
               Align one read every 'S' reads.

 Options for subsampling reads.
   -subsample (0)
               Proportion of reads to randomly subsample (expressed as a decimal) and align.

 -v            Print some verbose information.
 -V 2          Make verbosity more verbose.  Probably only useful for development.
 -h            Print this help file.

To cite BLASR, please use: Chaisson M.J., and Tesler G., Mapping
single molecule sequencing reads using Basic Local Alignment with
Successive Refinement (BLASR): Theory and Application, BMC
Bioinformatics 2012, 13:238 .
 
 
rachaelx/blasr.txt ยท Last modified: 2013/07/04 15:36 by giancarlo

Developers: CNR Ceris IT Office and Library
Giancarlo Birello (g.birello _@_ ceris.cnr.it) and Anna Perin (a.perin _@_ ceris.cnr.it)
BioInfo@TO.CNR is licensed under: Creative Commons License
Recent changes RSS feed Creative Commons License Valid XHTML 1.0 Valid CSS Driven by DokuWiki
Drupal Garland Theme for Dokuwiki