Skip to content
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension


Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
8 changes: 4 additions & 4 deletions machine/corpora/__init__.py
Original file line number Diff line number Diff line change
Expand Up @@ -28,12 +28,12 @@
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .paratext_project_terms_parser_base import KeyTerm, ParatextProjectTermsParserBase
from .paratext_project_text_updater_base import ParatextProjectTextUpdaterBase
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase
from .paratext_text_corpus import ParatextTextCorpus
from .place_markers_usfm_update_block_handler import PlaceMarkersAlignmentInfo, PlaceMarkersUsfmUpdateBlockHandler
from .scripture_element import ScriptureElement
from .scripture_ref import EMPTY_SCRIPTURE_REF, ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
from .scripture_text_corpus import (
ScriptureTextCorpus,
create_versification_ref_corpus,
Expand Down Expand Up @@ -139,15 +139,15 @@
"ParatextProjectSettingsParserBase",
"ParatextProjectTermsParserBase",
"ParatextProjectTextUpdaterBase",
"ParatextProjectVersificationErrorDetector",
"ParatextProjectVersificationErrorDetectorBase",
"ParatextTextCorpus",
"parse_usfm",
"PlaceMarkersAlignmentInfo",
"PlaceMarkersUsfmUpdateBlockHandler",
"RtlReferenceOrder",
"ScriptureElement",
"ScriptureRef",
"ScriptureRefUsfmParserHandler",
"ScriptureRefUsfmParserHandlerBase",
"ScriptureTextCorpus",
"ScriptureTextType",
"StandardParallelTextCorpus",
Expand Down
18 changes: 15 additions & 3 deletions machine/corpora/file_paratext_project_file_handler.py
Original file line number Diff line number Diff line change
@@ -1,3 +1,4 @@
import os
from pathlib import Path
from typing import BinaryIO, Optional

Expand All @@ -11,17 +12,28 @@ def __init__(self, project_dir: StrPath) -> None:
self._project_dir = Path(project_dir)

def exists(self, file_name: str) -> bool:
return (self._project_dir / file_name).is_file()
for actual_file_name in os.listdir(self._project_dir):
if actual_file_name.lower() == file_name.lower():
return True
return False

def open(self, file_name: str) -> BinaryIO:
for actual_file_name in os.listdir(self._project_dir):
if actual_file_name.lower() == file_name.lower():
return open(self._project_dir / actual_file_name, "rb")
return open(self._project_dir / file_name, "rb")

def find(self, extension: str) -> Optional[Path]:
return next(self._project_dir.glob(f"*{extension}"), None)

def create_stylesheet(self, file_name: str) -> UsfmStylesheet:
custom_stylesheet_filename = self._project_dir / "custom.sty"
custom_stylesheet_file_name = "custom.sty"
for actual_file_name in os.listdir(self._project_dir):
if actual_file_name.lower() == custom_stylesheet_file_name:
custom_stylesheet_file_name = actual_file_name
break
custom_stylesheet_path = self._project_dir / custom_stylesheet_file_name
return UsfmStylesheet(
file_name,
custom_stylesheet_filename if custom_stylesheet_filename.is_file() else None,
custom_stylesheet_path if custom_stylesheet_path.is_file() else None,
)
Original file line number Diff line number Diff line change
@@ -1,10 +1,10 @@
from ..utils.typeshed import StrPath
from .file_paratext_project_file_handler import FileParatextProjectFileHandler
from .file_paratext_project_settings_parser import FileParatextProjectSettingsParser
from .paratext_project_versification_error_detector import ParatextProjectVersificationErrorDetector
from .paratext_project_versification_error_detector_base import ParatextProjectVersificationErrorDetectorBase


class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetector):
class FileParatextProjectVersificationErrorDetector(ParatextProjectVersificationErrorDetectorBase):
def __init__(self, project_dir: StrPath) -> None:
super().__init__(
FileParatextProjectFileHandler(project_dir), FileParatextProjectSettingsParser(project_dir).parse()
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/paratext_project_settings.py
Original file line number Diff line number Diff line change
Expand Up @@ -53,9 +53,9 @@ def get_book_file_name(self, book_id: str) -> str:
book_part = _get_book_file_name_digits(book_id) + book_id
return self.file_name_prefix + book_part + self.file_name_suffix

def get_all_scripture_book_file_names(self) -> Iterable[str]:
def get_all_scripture_book_ids(self) -> Iterable[str]:
for book_id in get_scripture_books():
yield self.get_book_file_name(book_id)
yield book_id


def _get_book_file_name_digits(book_id: str) -> str:
Expand Down
Original file line number Diff line number Diff line change
@@ -1,13 +1,14 @@
from typing import List, Optional, Union
from typing import List, Optional, Set, Union

from ..scripture.canon import book_id_to_number
from .paratext_project_file_handler import ParatextProjectFileHandler
from .paratext_project_settings import ParatextProjectSettings
from .paratext_project_settings_parser_base import ParatextProjectSettingsParserBase
from .usfm_parser import parse_usfm
from .usfm_versification_error_detector import UsfmVersificationError, UsfmVersificationErrorDetector


class ParatextProjectVersificationErrorDetector:
class ParatextProjectVersificationErrorDetectorBase:
def __init__(
self,
paratext_project_file_handler: ParatextProjectFileHandler,
Expand All @@ -20,14 +21,19 @@ def __init__(
self._settings = settings

def get_usfm_versification_errors(
self,
handler: Optional[UsfmVersificationErrorDetector] = None,
self, handler: Optional[UsfmVersificationErrorDetector] = None, books: Optional[Set[int]] = None
) -> List[UsfmVersificationError]:
handler = handler or UsfmVersificationErrorDetector(self._settings)
for file_name in self._settings.get_all_scripture_book_file_names():
for book_id in self._settings.get_all_scripture_book_ids():

file_name = self._settings.get_book_file_name(book_id)

if not self._paratext_project_file_handler.exists(file_name):
continue

if books is not None and not book_id_to_number(book_id) in books:
continue

with self._paratext_project_file_handler.open(file_name) as sfm_file:
usfm: str = sfm_file.read().decode(self._settings.encoding)
try:
Expand Down
Original file line number Diff line number Diff line change
Expand Up @@ -22,10 +22,14 @@ class ScriptureTextType(Enum):


def _is_embed_style(marker: Optional[str]) -> bool:
return marker is not None and (marker.strip("*") in _EMBED_STYLES or marker.startswith("z"))
return marker is not None and marker.strip("*") in _EMBED_STYLES


class ScriptureRefUsfmParserHandler(UsfmParserHandler, ABC):
def is_private_use_marker(marker: str):
return marker is not None and marker.startswith("z")


class ScriptureRefUsfmParserHandlerBase(UsfmParserHandler, ABC):
def __init__(self) -> None:
self._cur_verse_ref: VerseRef = VerseRef()
self._cur_elements_stack: List[ScriptureElement] = []
Expand All @@ -46,22 +50,29 @@ def chapter(self, state: UsfmParserState, number: str, marker: str, alt_number:
def verse(
self, state: UsfmParserState, number: str, marker: str, alt_number: Optional[str], pub_number: Optional[str]
) -> None:
if state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
self._end_verse_text(state, self._create_verse_refs())
# ignore duplicate verses
self._duplicate_verse = True
# Non-latin numbers are implicitly handled

if state.chapter_has_verse_zero and state.verse_ref.verse_num == 0:
# Fall through for the special case of verse 0 being specified in the USFM
pass
elif state.verse_ref == self._cur_verse_ref and not self._duplicate_verse:
if state.verse_ref.verse_num > 0:
self._end_verse_text(state, self._create_verse_refs())
# ignore duplicate verses
self._duplicate_verse = True
return
elif are_overlapping_verse_ranges(verse1=number, verse2=self._cur_verse_ref.verse):
# merge overlapping verse ranges in to one range
verse_ref: VerseRef = self._cur_verse_ref.copy()
verse_ref.verse = merge_verse_ranges(number, self._cur_verse_ref.verse)
self._update_verse_ref(verse_ref, marker)
return
if self._current_text_type == ScriptureTextType.NONVERSE:
self._end_non_verse_text_wrapper(state)
else:
if self._current_text_type == ScriptureTextType.NONVERSE:
self._end_non_verse_text_wrapper(state)
elif self._current_text_type == ScriptureTextType.VERSE:
self._end_verse_text_wrapper(state)
self._update_verse_ref(state.verse_ref, marker)
self._start_verse_text_wrapper(state)
self._end_verse_text_wrapper(state)
self._update_verse_ref(state.verse_ref, marker)
self._start_verse_text_wrapper(state)

def start_para(
self,
Expand All @@ -70,13 +81,21 @@ def start_para(
unknown: Optional[bool],
attributes: Optional[Sequence[UsfmAttribute]],
) -> None:
# ignore private-use markers
if is_private_use_marker(marker):
return

if self._cur_verse_ref.is_default:
self._update_verse_ref(state.verse_ref, marker)
if not state.is_verse_text:
self._start_parent_element(marker)
self._start_non_verse_text_wrapper(state)

def end_para(self, state: UsfmParserState, marker: str) -> None:
# ignore private-use markers
if is_private_use_marker(marker):
return

if self._current_text_type == ScriptureTextType.NONVERSE:
self._end_parent_element()
self._end_non_verse_text_wrapper(state)
Expand Down Expand Up @@ -126,6 +145,10 @@ def opt_break(self, state: UsfmParserState) -> None:
def start_char(
self, state: UsfmParserState, marker: str, unknown: bool, attributes: Optional[Sequence[UsfmAttribute]]
) -> None:
# ignore private-use markers
if is_private_use_marker(marker):
return

# if we hit a character marker in a verse paragraph and we aren't in a verse, then start a non-verse segment
self._check_convert_verse_para_to_non_verse(state)

Expand All @@ -135,6 +158,10 @@ def start_char(
def end_char(
self, state: UsfmParserState, marker: str, attributes: Optional[Sequence[UsfmAttribute]], closed: bool
) -> None:
# ignore private-use markers
if is_private_use_marker(marker):
return

if _is_embed_style(marker):
self._end_embed_text_wrapper(state)

Expand Down Expand Up @@ -162,9 +189,9 @@ def _start_verse_text_wrapper(self, state: UsfmParserState) -> None:
self._start_verse_text(state, self._create_verse_refs())

def _end_verse_text_wrapper(self, state: UsfmParserState) -> None:
if not self._duplicate_verse and self._cur_verse_ref.verse_num > 0:
if not self._duplicate_verse and (self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero):
self._end_verse_text(state, self._create_verse_refs())
if self._cur_verse_ref.verse_num > 0:
if self._cur_verse_ref.verse_num > 0 or state.chapter_has_verse_zero:
self._cur_text_type_stack.pop()

def _start_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
Expand All @@ -177,7 +204,17 @@ def _end_non_verse_text_wrapper(self, state: UsfmParserState) -> None:
self._cur_text_type_stack.pop()

def _update_verse_ref(self, verse_ref: VerseRef, marker: str) -> None:
if not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
if (
self._cur_verse_ref.verse_num == 0
and verse_ref.verse_num == 0
and not verse_ref.has_multiple
and marker == "v"
):
# As the verse 0 marker appears within the middle of verse 0,
# we should not break the position of current element stack by clearing it.
# Instead, we just need to pop the current element off the stack.
self._cur_elements_stack.pop()
elif not are_overlapping_verse_ranges(verse_ref, self._cur_verse_ref):
self._cur_elements_stack.clear()
self._cur_elements_stack.append(ScriptureElement(0, marker))
self._cur_verse_ref = verse_ref.copy()
Expand Down Expand Up @@ -239,6 +276,8 @@ def _check_convert_verse_para_to_non_verse(self, state: UsfmParserState) -> None
and para_tag.marker != "tr"
and state.is_verse_para
and self._cur_verse_ref.verse_num == 0
and not state.chapter_has_verse_zero
and not is_private_use_marker(para_tag.marker)
):
self._start_parent_element(para_tag.marker)
self._start_non_verse_text_wrapper(state)
22 changes: 20 additions & 2 deletions machine/corpora/update_usfm_parser_handler.py
Original file line number Diff line number Diff line change
Expand Up @@ -3,7 +3,7 @@

from ..scripture.verse_ref import IgnoreSegmentsVerseRef, VerseRef, Versification
from .scripture_ref import ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
from .usfm_parser_state import UsfmParserState
from .usfm_stylesheet import UsfmStylesheet
from .usfm_tag import UsfmTextType
Expand Down Expand Up @@ -38,7 +38,11 @@ def __init__(self, refs: Sequence[ScriptureRef], text: str, metadata: Optional[d
self.metadata = metadata


class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandler):
def sanitize_verse_data(verse_data: str) -> str:
return verse_data.replace("\u200F", "")


class UpdateUsfmParserHandler(ScriptureRefUsfmParserHandlerBase):
def __init__(
self,
rows: Optional[Sequence[UpdateUsfmRow]] = None,
Expand Down Expand Up @@ -319,10 +323,16 @@ def _end_non_verse_text(self, state: UsfmParserState, scripture_ref: ScriptureRe
self._end_update_block(state, [scripture_ref])

def _end_embed_text(self, state: UsfmParserState, scripture_ref: ScriptureRef) -> None:
# If this embed is outside an update block, create an update block just for this embed
embed_outside_of_block = len(self._update_block_stack) == 0
if embed_outside_of_block:
self._start_update_block([scripture_ref])
self._update_block_stack[-1].add_embed(
self._embed_tokens, marked_for_removal=self._embed_behavior == UpdateUsfmMarkerBehavior.STRIP
)
self._embed_tokens.clear()
if embed_outside_of_block:
self._end_update_block(state, [scripture_ref])

def get_usfm(self, stylesheet: Union[str, UsfmStylesheet] = "usfm.sty") -> str:
if isinstance(stylesheet, str):
Expand All @@ -349,6 +359,12 @@ def _advance_rows(self, seg_scr_refs: Sequence[ScriptureRef]) -> Tuple[List[str]
row_texts: List[str] = []
row_metadata = None
source_index: int = 0

# handle the special case of verse 0, which although first in the rows,
# it will be retrieved some of other segments in the verse.
if len(seg_scr_refs) > 0 and seg_scr_refs[0].verse_num == 0 and len(seg_scr_refs[0].path) == 0:
self._verse_row_index = 0

while self._verse_row_index < len(self._verse_rows) and source_index < len(seg_scr_refs):
compare: int = 0
row = self._rows[self._verse_rows[self._verse_row_index]]
Expand Down Expand Up @@ -378,6 +394,8 @@ def _collect_updatable_tokens(self, state: UsfmParserState) -> None:
self._use_updated_text()
while self._token_index <= state.index + state.special_token_count:
token = state.tokens[self._token_index]
if token.type == UsfmTokenType.VERSE and token.data is not None:
token.data = sanitize_verse_data(token.data)
if self._current_text_type == ScriptureTextType.EMBED:
self._embed_tokens.append(token)
elif (
Expand Down
7 changes: 7 additions & 0 deletions machine/corpora/usfm_parser.py
Original file line number Diff line number Diff line change
Expand Up @@ -223,6 +223,8 @@ def process_token(self) -> bool:
verse_ref = self.state.verse_ref
verse_ref.chapter = token.data
verse_ref.verse_num = 0
self.state.chapter_has_verse_zero = False

# Verse offset is not zeroed for chapter 1, as it is part of intro
if verse_ref.chapter_num != 1:
self.state.verse_offset = 0
Expand Down Expand Up @@ -261,7 +263,12 @@ def process_token(self) -> bool:

assert token.data is not None
verse_ref = self.state.verse_ref
prev_verse_num = verse_ref.verse_num
verse_ref.verse = token.data
if verse_ref.verse_num == 0: # This token is \v 0
self.state.chapter_has_verse_zero = True
elif verse_ref.verse_num == -1: # Ignore invalid verse numbers
verse_ref.verse_num = prev_verse_num
self.state.verse_offset = 0

if self.handler is not None:
Expand Down
5 changes: 3 additions & 2 deletions machine/corpora/usfm_parser_state.py
Original file line number Diff line number Diff line change
Expand Up @@ -37,6 +37,7 @@ def __init__(self, stylesheet: UsfmStylesheet, versification: Versification, tok
self._tokens = tokens
self.index = -1
self.special_token = False
self.chapter_has_verse_zero = False
self._special_token_count: int = 0

@property
Expand Down Expand Up @@ -108,8 +109,8 @@ def is_verse_para(self) -> bool:

@property
def is_verse_text(self) -> bool:
# anything before verse 1 is not verse text
if self.verse_ref.verse_num == 0:
# anything before verse 1 is not verse text, unless the USFM specified verse 0
if self.verse_ref.verse_num == 0 and not self.chapter_has_verse_zero:
return False

# Sidebars and notes are not verse text
Expand Down
4 changes: 2 additions & 2 deletions machine/corpora/usfm_text_base.py
Original file line number Diff line number Diff line change
Expand Up @@ -7,7 +7,7 @@
from ..utils.string_utils import has_sentence_ending
from .corpora_utils import gen
from .scripture_ref import ScriptureRef
from .scripture_ref_usfm_parser_handler import ScriptureRefUsfmParserHandler, ScriptureTextType
from .scripture_ref_usfm_parser_handler_base import ScriptureRefUsfmParserHandlerBase, ScriptureTextType
from .scripture_text import ScriptureText
from .stream_container import StreamContainer
from .text_row import TextRow
Expand Down Expand Up @@ -76,7 +76,7 @@ def _read_usfm(self) -> str:
return reader.read()


class _TextRowCollector(ScriptureRefUsfmParserHandler):
class _TextRowCollector(ScriptureRefUsfmParserHandlerBase):
def __init__(self, text: UsfmTextBase) -> None:
super().__init__()

Expand Down
Loading