Source code for dynamicdl.processing.txtfile


from typing import Union, Optional, Any

from .._utils import load_config, union
from ..data.datatype import DataType
from ..data.datatypes import DataTypes
from ..parsing.static import Static
from ..parsing.generic import Generic
from ..parsing.alias import Alias
from .datafile import DataFile
from ..parsing.pairing import Pairing

config = load_config()

class TXTFile(DataFile):
    '''
    The `TXTFile` class is an annotation object for parsing `.txt` files. It can also parse
    anything else that is represented in plaintext, i.e. with UTF-8 encoding. Its form takes
    the shape of any nested dict structure, but note that distinct lines must take distinct
    forms so that they can be differentiated and disambiguated.

    An example of a txt file that we want to parse:

    .. code-block::

        imageset1
        class1
        image1
        1.0 2.0 3.0 4.0
        5.0 6.0 7.0 8.0
        image2
        2.0 3.0 5.6 2.43
        image3
        5.4 12.4 543.2 12.3
        2.0 3.0 5.6 2.44
        2.0 3.0 5.6 2.46
        2.0 3.0 5.6 2.48
        class2
        image4
        32.54 21.4 32.43 12.23
        image5
        imageset2
        class1
        image6
        32.54 21.4 32.43 12.256
        classes
        class1 abc
        class2 def
        class3 ghi

    Observe that each line can be distinctly classified in a hierarchical sense; that is,
    each individual line can be attributed to a single purpose.

    .. code-block:: python

        TXTFile({
            Generic('imageset{}', DT.IMAGE_SET_ID): {
                Generic('class{}', DT.CLASS_ID): {
                    Generic('image{}', DT.IMAGE_ID): [
                        Generic('{} {} {} {}', DT.X1, DT.X2, DT.Y1, DT.Y2)
                    ]
                }
            },
            'classes': Pairing([
                Generic('class{} {}', DT.CLASS_ID, DT.CLASS_NAME)
            ], DT.CLASS_ID, DT.CLASS_NAME)
        })

    Notice the natural structure which is inherited. Each generic ends up distinct from the
    others, so the dataset is not ambiguous. Written as a hierarchy, the file looks as follows:

    .. code-block::

        imageset1
            class1
                image1
                    1.0 2.0 3.0 4.0
                    5.0 6.0 7.0 8.0
                image2
                    2.0 3.0 5.6 2.43
                image3
                    5.4 12.4 543.2 12.3
                    2.0 3.0 5.6 2.44
                    2.0 3.0 5.6 2.46
                    2.0 3.0 5.6 2.48
            class2
                image4
                    32.54 21.4 32.43 12.23
                image5
        imageset2
            class1
                image6
                    32.54 21.4 32.43 12.256
        classes
            class1 abc
            class2 def
            class3 ghi

    Notice that this is exactly the structure reflected in the code above used to parse the
    file. We can also specify an `ignore_type` such that any line which matches the given
    Generic or string is skipped (see the usage sketch at the end of this module).

    :param form: The form which matches the data to be read from `TXTFile`.
    :type form: dict[str | DataType | Static | Generic | Alias, Any] | list[Any]
    :param ignore_type: A list, or a single value, of Generic/str objects; any line matching
        one of them is ignored during parsing.
    :type ignore_type: Optional[Union[list[Union[Generic, str]], Generic, str]]
    '''
    def __init__(
        self,
        form: Union[dict[Union[str, DataType, Static, Generic, Alias], Any], list[Any]],
        ignore_type: Optional[Union[list[Union[Generic, str]], Generic, str]] = None
    ) -> None:
        self.form = form
        self.named = isinstance(form, dict)
        self.ignore_type: list[Generic] = []
        if ignore_type:
            ignore_type = union(ignore_type)
            # String rules are promoted to prefix-matching generics.
            self.ignore_type = [
                Generic(rule + '{}', DataTypes.GENERIC) if isinstance(rule, str) else rule
                for rule in ignore_type
            ]
    def parse(
        self,
        path: str,
        curr_path: list[str]
    ) -> dict:
        from .._main._engine import expand_generics

        def filter_ignores(line: str):
            for ignore_type in self.ignore_type:
                if ignore_type.match(line)[0]:
                    return True
            return False

        data = []
        with open(path, 'r', encoding='utf-8') as f:
            lines = f.readlines()
        for line in lines:
            line = line.strip()
            if filter_ignores(line):
                continue
            data.append(line)
        data, _ = TXTFile._parse(data, self.form)
        return expand_generics(
            curr_path,
            data,
            self.form
        )
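    # For illustration (a hedged sketch, not produced verbatim by the library): given the
    # sample file from the class docstring, TXTFile._parse builds a nested structure keyed
    # by the matched lines, roughly
    #
    #     {'imageset1': {'class1': {'image1': ['1.0 2.0 3.0 4.0', '5.0 6.0 7.0 8.0'],
    #                               ...},
    #                    'class2': ...},
    #      'imageset2': ...,
    #      'classes': ['class1 abc', 'class2 def', 'class3 ghi']}
    #
    # which parse then hands to expand_generics together with the form.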
    @staticmethod
    def _parse(data: list[str], form: Any) -> Any:
        if isinstance(form, Pairing):
            form = form.form
        if isinstance(form, (Generic, Static, str, DataType)):
            if isinstance(form, str):
                form = Static(form)
            elif isinstance(form, DataType):
                form = Generic('{}', form)
            # match() returns (success, values); succeed only when the first line matches.
            if form.match(data[0])[0]:
                return data, 1
            raise ValueError("TXTFile Failed to parse")
        if isinstance(form, dict):
            return TXTFile._parse_dict(data, form)
        if isinstance(form, list):
            return TXTFile._parse_list(data, form)
        raise ValueError("Unknown Token")

    @staticmethod
    def _parse_list(data: list[str], form: list) -> tuple[list, int]:
        parsed_data = []
        ctr = 0
        i = 0
        while True:
            if ctr >= len(data):
                return parsed_data, ctr
            next_form = form[i]
            if isinstance(next_form, (list, dict)):
                # Recurse into the nested sub-form at this position.
                obj_data, endline = TXTFile._parse(data[ctr:], next_form)
                parsed_data.append(obj_data)
                ctr += endline
                i = (i + 1) % len(form)
                continue
            if isinstance(next_form, str):
                next_form = Static(next_form)
            elif isinstance(next_form, DataType):
                next_form = Generic('{}', next_form)
            result, _ = next_form.match(data[ctr])
            if not result:
                return parsed_data, ctr
            parsed_data.append(data[ctr])
            ctr += 1
            i = (i + 1) % len(form)

    @staticmethod
    def _parse_dict(data: list[str], form: dict) -> tuple[dict, int]:
        # Normalize the form keys to Static/Generic objects.
        cleaned_form: dict[Union[Static, Generic], Any] = {}
        for generic, subform in form.items():
            if isinstance(generic, str):
                generic = Static(generic)
            elif isinstance(generic, DataType):
                generic = Generic('{}', generic)
            cleaned_form[generic] = subform
        form: dict[Union[Static, Generic], Any] = cleaned_form

        parsed_data = {}
        prev = -1
        start = -1
        for i, line in enumerate(data):
            result = False
            for generic in form:
                result, _ = generic.match(line)
                if result:
                    break
            if not result:
                continue
            if start != -1:
                # Close out the previous key with the lines collected since its match.
                parsed_data[prev_line], _ = TXTFile._parse(data[start:i], form[key])
            prev = start
            start = i + 1
            key = generic
            prev_line = line
        if start != prev:
            parsed_data[prev_line], start = TXTFile._parse(data[start:], form[key])
        else:
            start = 0
        return parsed_data, start
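
# A minimal usage sketch (hedged: `DT` is assumed to alias DataTypes, and the '#' ignore
# rule below is hypothetical). String ignore rules are promoted to Generic('#{}',
# DataTypes.GENERIC) in __init__, so comment-style lines are skipped before the form is
# matched. The form mirrors the example in the class docstring.
if __name__ == '__main__':
    DT = DataTypes
    annotations = TXTFile(
        {
            Generic('imageset{}', DT.IMAGE_SET_ID): {
                Generic('class{}', DT.CLASS_ID): {
                    Generic('image{}', DT.IMAGE_ID): [
                        Generic('{} {} {} {}', DT.X1, DT.X2, DT.Y1, DT.Y2)
                    ]
                }
            },
            'classes': Pairing([
                Generic('class{} {}', DT.CLASS_ID, DT.CLASS_NAME)
            ], DT.CLASS_ID, DT.CLASS_NAME)
        },
        ignore_type='#'
    )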