diff --git a/README.md b/README.md index 211e818..e540b53 100644 --- a/README.md +++ b/README.md @@ -158,6 +158,10 @@ Each partition takes into account a fraction of its neighboring partitions k-mer Modimizer sketch size. Must be lower than window size `w`. A lower sketch size means less k-mers to compare (and faster runtime), at the expense of lower accuracy. Recommended to be kept >= 1000. +`--forward ` + +Use forward k-mers only, instead of the default of canonical k-mers. Warning: this will give strand specific output. + `-r / --resolution ` Dotplot resolution. This corresponds to the number of windows each input sequence is partitioned into. Default is 1000. Overrides the `--window` parameter. diff --git a/pyproject.toml b/pyproject.toml index 18d4ea5..87556b2 100755 --- a/pyproject.toml +++ b/pyproject.toml @@ -4,7 +4,7 @@ build-backend = "setuptools.build_meta" [project] name = "ModDotPlot" -version = "0.9.8" +version = "0.9.9" requires-python = ">= 3.7" dependencies = [ "pysam", diff --git a/src/moddotplot/const.py b/src/moddotplot/const.py index 1453d31..692ee54 100755 --- a/src/moddotplot/const.py +++ b/src/moddotplot/const.py @@ -1,4 +1,4 @@ -VERSION = "0.9.8" +VERSION = "0.9.9" COLS = [ "#query_name", "query_start", diff --git a/src/moddotplot/moddotplot.py b/src/moddotplot/moddotplot.py index 2b5184b..9667ce7 100755 --- a/src/moddotplot/moddotplot.py +++ b/src/moddotplot/moddotplot.py @@ -316,6 +316,18 @@ def get_parser(): type=str, ) + static_parser.add_argument( + "--forward", + action="store_true", + help="Enforce forward only k-mers instead of canonical k-mers. Warning: only use if you want strand-specific output!", + ) + + static_parser.add_argument( + "--plot-direction", + action="store_true", + help="Create a plot containing the direction of each k-mer array (relative to the first array). Arrays with inversions will be highlighted in blue (forward) and pink (reverse).", + ) + static_parser.add_argument( "--colors", default=None, @@ -635,7 +647,10 @@ def main(): # -----------LOAD SEQUENCES INTO MEMORY----------- kmer_list = [] for i in fasta_list: - kmer_list.append(readKmersFromFile(i, args.kmer, False)) + if args.forward: + kmer_list.append(readKmersFromFile(i, args.kmer, False, True)) + else: + kmer_list.append(readKmersFromFile(i, args.kmer, False, False)) k_list = [item for sublist in kmer_list for item in sublist] # Throw error if compare only selected with one sequence. if len(k_list) < 2 and args.compare_only: diff --git a/src/moddotplot/parse_fasta.py b/src/moddotplot/parse_fasta.py index 2094a26..35a23af 100644 --- a/src/moddotplot/parse_fasta.py +++ b/src/moddotplot/parse_fasta.py @@ -37,7 +37,9 @@ def extractRegion(seq_name): return None -def generateKmersFromFasta(seq: Sequence[str], k: int, quiet: bool) -> Iterable[int]: +def generateKmersFromFasta( + seq: Sequence[str], k: int, quiet: bool, fw_only: bool +) -> Iterable[int]: n = len(seq) if not quiet: progress_thresholds = round(n / 77) @@ -60,11 +62,12 @@ def generateKmersFromFasta(seq: Sequence[str], k: int, quiet: bool) -> Iterable[ # Remove case sensitivity kmer = seq[i : i + k].upper() fh = mmh3.hash(kmer) - - # Calculate reverse complement hash directly without the need for translation - rc = mmh3.hash(kmer[::-1].translate(tab_b)) - - yield fh if fh < rc else rc + if fw_only: + yield fh + else: + # Calculate reverse complement hash directly without the need for translation + rc = mmh3.hash(kmer[::-1].translate(tab_b)) + yield fh if fh < rc else rc def isValidFasta(file_path): @@ -151,7 +154,9 @@ def printProgressBar( print() -def readKmersFromFile(filename: str, ksize: int, quiet: bool) -> List[List[int]]: +def readKmersFromFile( + filename: str, ksize: int, quiet: bool, fw_only: bool +) -> List[List[int]]: """ Given a filename and an integer k, returns a list of all k-mers found in the sequences in the file. """ @@ -161,7 +166,9 @@ def readKmersFromFile(filename: str, ksize: int, quiet: bool) -> List[List[int]] for seq_id in seq.references: print(f"Retrieving k-mers from {seq_id}.... \n") kmers_for_seq = [] - for kmer_hash in generateKmersFromFasta(seq.fetch(seq_id), ksize, quiet): + for kmer_hash in generateKmersFromFasta( + seq.fetch(seq_id), ksize, quiet, fw_only + ): kmers_for_seq.append(kmer_hash) all_kmers.append(kmers_for_seq) print(f"\n{seq_id} k-mers retrieved! \n")