70 changes: 69 additions & 1 deletion bin/CountSNPASE.py
@@ -81,6 +81,24 @@
###############################################################################


###
# CARLO'S COMMENTS
#
# I can't remember when/if this function gets used, but I've since learned of a MUCH
# faster way to access a FASTA sequence without storing the whole stupid thing in
# memory, which will kill most applications on the human genome.
#
# First, the FASTA must be indexed with samtools:
#
#   samtools faidx ref.fasta
#
# Then we can use pysam to fetch any 0-based, half-open interval as follows:
#
#   with pysam.FastaFile('ref.fasta') as in_fasta:
#       seq = in_fasta.fetch(reference=CHROM, start=START, end=END)
#
# Because the file is indexed and pysam wraps the htslib C API, access is very fast
# and uses almost no memory.
###
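
A minimal runnable version of that suggestion, with hypothetical names (fetch_region, 'ref.fasta') and assuming the index from samtools faidx already exists:

import pysam

def fetch_region(fasta_path, chrom, start, end):
    # FastaFile reads the small .fai index instead of the whole sequence,
    # so memory use stays flat even on a human-sized genome.
    with pysam.FastaFile(fasta_path) as in_fasta:
        # fetch() takes 0-based, half-open coordinates.
        return in_fasta.fetch(reference=chrom, start=start, end=end)

# e.g. the first 80 bp of chr1:
# seq = fetch_region('ref.fasta', 'chr1', 0, 80)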

def fasta_to_dict(file):
"""Convert a FASTA file to a dictionary.

@@ -483,6 +501,30 @@ def main(argv=None):
pos = chrom_to_num(line_t[0]) + '|' + str(line_t[2])
snps[pos] = line_t[3]

###
# CARLO'S COMMENTS
#
# Reading in BED/GTF/VCF files is a situation where pandas can make code easier to
# maintain and understand. It's not necessarily fewer lines, but it's easier to see
# what's going on. Also, when I wrote this script, I didn't realize that tuples can
# act as dictionary keys, which is MUCH better than the CHROM + '|' + POS
# construction.
#
# From working with computer programmers, I've learned that, as a general rule, it's
# bad form to use elements from split strings without first assigning them to
# variables. This is because it requires everyone to know the exact formatting that
# args.snps will have. If they're first assigned to variables, or have columns named
# with pandas, as below, it's much clearer to everyone what's happening.
#
# (assuming import pandas as pd)
#
#   snps = {}
#   snp_file = pd.read_table(args.snps, header=None,
#                            names=["CHROM", "START", "END", "SNP"])
#   for idx, row in snp_file.iterrows():
#       snps[(chrom_to_num(row["CHROM"]), int(row["END"]))] = row["SNP"]
#
###
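
The same pattern as a self-contained sketch (hypothetical 'snps.bed' input, and a stub chrom_to_num standing in for the script's real helper):

import pandas as pd

def chrom_to_num(chrom):
    # Stub for the script's real chrom_to_num(); strips a leading 'chr'.
    return chrom[3:] if chrom.startswith('chr') else chrom

snps = {}
snp_file = pd.read_table('snps.bed', header=None,
                         names=['CHROM', 'START', 'END', 'SNP'])
for idx, row in snp_file.iterrows():
    # A (chrom, pos) tuple is hashable, so it works directly as a dict key;
    # no building and later re-splitting of 'CHROM|POS' strings.
    snps[(chrom_to_num(row['CHROM']), int(row['END']))] = row['SNP']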

# This is the dictionary of potential SNPs for each read.
potsnp_dict = {}

@@ -497,7 +539,15 @@
snp_count = 0
ryo_filter = 0

###
# CARLO'S COMMENTS
#
# This could be:
#
#   with pysam.Samfile(args.reads, mode) as in_sam:
#       for line in in_sam.fetch(until_eof=True):
###
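
Fleshed out as a runnable sketch (hypothetical 'reads.bam'; pysam.Samfile is the older alias of pysam.AlignmentFile, which newer pysam prefers):

import pysam

mode = 'rb'  # 'rb' for BAM input, 'r' for plain SAM
with pysam.AlignmentFile('reads.bam', mode) as in_sam:
    # until_eof=True streams records in file order, so it also works on
    # unsorted/unindexed files, unlike a coordinate-based fetch().
    for line in in_sam.fetch(until_eof=True):
        print(line.query_name)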

for line in in_sam:
count += 1

# Skip lines that overlap indels OR don't match Ns
@@ -507,6 +557,24 @@
indel_skip += 1
continue

###
# CARLO'S COMMENTS
#
# Newer versions of pysam let you get variants from reads without having to
# manually parse tags, which would replace a large amount of the code below.
# Note that with_seq=True requires the MD tag to be present on the read.
#
#   for read_pos, genome_pos, genome_seq in line.get_aligned_pairs(with_seq=True):
#       if read_pos is not None and genome_pos is not None:  # Is this base aligned?
#           if (line.reference_name, genome_pos) in snps:  # Assuming tuple keys now
#
#               # This would be a good point to include a minimum quality score filter.
#               if line.query_qualities[read_pos] >= MIN_QUALITY:
#
#                   # The read's base at the SNP will be:
#                   line.query_sequence[read_pos]
#                   # and can be added to the appropriate counter.
###
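
Put together as a self-contained counting loop (hypothetical names throughout — 'reads.bam', the toy snps dict, MIN_QUALITY — and assuming the aligner wrote MD tags):

import collections
import pysam

MIN_QUALITY = 20                      # assumed Phred cutoff
snps = {('1', 12345): 'T'}            # toy (chrom, 0-based pos) -> SNP dict
allele_counts = collections.defaultdict(collections.Counter)

with pysam.AlignmentFile('reads.bam', 'rb') as in_sam:
    for line in in_sam.fetch(until_eof=True):
        if line.is_unmapped:
            continue
        # with_seq=True needs the MD tag; yields (read_pos, ref_pos, ref_base).
        for read_pos, genome_pos, genome_seq in line.get_aligned_pairs(with_seq=True):
            if read_pos is None or genome_pos is None:
                continue  # insertion, deletion, or clipped base
            key = (line.reference_name, genome_pos)
            if key in snps and line.query_qualities[read_pos] >= MIN_QUALITY:
                # Tally the read's base at this SNP position.
                allele_counts[key][line.query_sequence[read_pos]] += 1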

# Split the tags to find the MD tag:
tags = line.tags
for tagname, tagval in tags: