BedBlastLift
The big guru has spoken some time ago and he said that tblastx is a very sensitive alignment tool, more than blastn or blat. You cannot run it on the whole genome, but you might give it a try on smaller sequences. However, blastToPsl cannot parse tblastx-output, as tblastx does not return real alignments (not sure here, but there seem to exist variable sized gaps "*" in the output), and without psl, not chain, no chain -> no liftover.
We can still simply take everything that overlaps an alignable region and map it to the WHOLE alignable region on the second sequence so as to give at least a hint where it should be roughly located. This is what the following scripts does. It needs the bed file from sequence1 and the tblastx output with option "-m 9" (tabular format) as input.
#!/usr/bin/python
from sys import *
from optparse import OptionParser
# === COMMAND LINE INTERFACE, OPTIONS AND HELP ===
parser = OptionParser("%prog [options] bedfile blastfile: Use a tblastx-tabular-file to lift annotations between sequences (approximatively), very slow")
#parser.add_option("-l", "--wordlength", dest="motiflength", action="store", help="length of word [default: %default, hexamers]", type="int", metavar="NUMBER", default="6")
#parser.add_option("-m", "--maf", dest="maf", action="store_true", help="force maf format [default: %default]", default=True)
#parser.add_option("-f", "--fasta", dest="maf", action="store_false", help="force fasta format (if not specified, we assume UCSC .maf file format")
(options, args) = parser.parse_args()
# ==== FUNCTIONs =====
class hit:
""" a hit from a blast tabular output file """
def __repr__(self):
return self.line
def parseBlast(file):
hits = []
for l in file:
if l.startswith("#"):
continue
fs = l.split()
if len(fs)!=12:
stderr.write("error: blast output does not have 12 fields. Make sure that you ran blast with -m 9 or -D T (for bl2seq)\n")
exit(1)
h = hit()
(h.qName, h.sName, h.percId, h.alnLen, h.mismatches, h.gapOpen, h.qStart, h.qEnd, h.sStart, h.sEnd, h.eVal, h.alnScore) = fs
h.sStart = int(h.sStart)
h.sEnd = int(h.sEnd)
h.qStart = int(h.qStart)
h.qEnd = int(h.qEnd)
if h.qEnd < h.qStart:
(h.qStart, h.qEnd) = (h.qEnd, h.qStart)
if h.sEnd < h.sStart:
(h.sStart, h.sEnd) = (h.sEnd, h.sStart)
h.line = l.strip()
hits.append(h)
return hits
class bed:
""" a feature from a bed file """
def overlaps(self, hit):
""" returns number of overlapping bp if two Features overlap or
false if no overlap """
if (( hit.sStart <= self.start and hit.sEnd > self.start) or \
(hit.sStart < self.end and hit.sEnd >= self.end) or
(hit.sStart > self.start and hit.sStart < self.end)):
return (min (hit.sEnd, self.end) - max (hit.sStart, self.start))
else:
return 0
def __repr__(self):
return self.line
def parseBed(file):
beds = []
for l in file:
l = l.strip()
b = bed()
if l.startswith("#"):
continue
fs = l.split()
if not len(fs) >= 4:
stderr.write("error: bed features need names.\n")
exit(1)
(b.chrom, b.start, b.end, b.name) = fs[0:4]
b.start = int(b.start)
b.end = int(b.end)
b.line = l
beds.append(b)
return beds
# ----------- MAIN --------------
if args==[]:
parser.print_help()
exit(1)
stderr.write("Reading blast output...\n")
f = open(args[0], "r")
hits = parseBlast(f)stderr.write("Searching for overlaps bed-blast...\n")
for b in beds:
overlaps = []
for h in hits:
length = b.overlaps(h)
if length > 0:
h.overlapLen=length
overlaps.append(h)
# choose highest scoring overlap
if len(overlaps)==0:
continue
overlaps.sort(key=lambda x: x.alnScore, reverse=True)
#for o in overlaps:
#print o
best = overlaps[0]
print best.qName, best.qStart, best.qEnd, b.name+"_ints"+str(best.overlapLen)
stderr.write("Reading bedfile ...\n")
f = open(args[1], "r")
beds = parseBed(f)
h.qStart = int(h.qStart)
h.qEnd = int(h.qEnd)
if h.qEnd < h.qStart:
(h.qStart, h.qEnd) = (h.qEnd, h.qStart)
if h.sEnd < h.sStart:
(h.sStart, h.sEnd) = (h.sEnd, h.sStart)
h.line = l.strip()
hits.append(h)
return hits