from typing import Union, Optional, Any
from .._utils import load_config, union
from ..data.datatype import DataType
from ..data.datatypes import DataTypes
from ..parsing.static import Static
from ..parsing.generic import Generic
from ..parsing.alias import Alias
from .datafile import DataFile
from ..parsing.pairing import Pairing
# Package configuration, loaded once when this module is imported.
# NOTE(review): `config` is not referenced by any code visible in this section of the file.
config = load_config()
# NOTE: a Sphinx viewcode "[docs]" link label was captured here during extraction; it is not code.
class TXTFile(DataFile):
    '''
    Annotation notator for parsing `.txt` files.

    `TXTFile` can parse anything that is represented in plaintext (the file is read with UTF-8
    encoding). Its form resembles a nested dict/list structure. Be careful: distinct kinds of
    lines must take distinct textual forms, otherwise lines cannot be told apart and the parse is
    ambiguous.

    An example of a txt file that we want to parse:

    .. code-block::

        imageset1
        class1
        image1
        1.0 2.0 3.0 4.0
        5.0 6.0 7.0 8.0
        image2
        2.0 3.0 5.6 2.43
        image3
        5.4 12.4 543.2 12.3
        2.0 3.0 5.6 2.44
        2.0 3.0 5.6 2.46
        2.0 3.0 5.6 2.48
        class2
        image4
        32.54 21.4 32.43 12.23
        image5
        imageset2
        class1
        image6
        32.54 21.4 32.43 12.256
        classes
        class1 abc
        class2 def
        class3 ghi

    Observe that each line can be distinctly classified in a hierarchical sense. That is, each
    individual line can be attributed to a single purpose.

    .. code-block:: python

        TXTFile({
            Generic('imageset{}', DT.IMAGE_SET_ID): {
                Generic('class{}', DT.CLASS_ID): {
                    Generic('image{}', DT.IMAGE_ID): [
                        Generic('{} {} {} {}', DT.X1, DT.X2, DT.Y1, DT.Y2)
                    ]
                }
            },
            'classes': Pairing([
                Generic('class{} {}', DT.CLASS_ID, DT.CLASS_NAME)
            ], DT.CLASS_ID, DT.CLASS_NAME)
        })

    Notice the natural structure which is inherited. Each generic ends up distinct from each
    other, so the dataset is not ambiguous. Viewed hierarchically, the same file looks as follows:

    .. code-block::

        imageset1
            class1
                image1
                    1.0 2.0 3.0 4.0
                    5.0 6.0 7.0 8.0
                image2
                    2.0 3.0 5.6 2.43
                image3
                    5.4 12.4 543.2 12.3
                    2.0 3.0 5.6 2.44
                    2.0 3.0 5.6 2.46
                    2.0 3.0 5.6 2.48
            class2
                image4
                    32.54 21.4 32.43 12.23
                image5
        imageset2
            class1
                image6
                    32.54 21.4 32.43 12.256
        classes
            class1 abc
            class2 def
            class3 ghi

    Notice that this is exactly the structure reflected in the above code used to parse the file.

    We can also specify an `ignore_type` such that any line which matches the Generic or string
    passed in is skipped. A string rule `s` is converted to `Generic(s + '{}', DataTypes.GENERIC)`.

    :param form: The form which matches the data to be read from `TXTFile`.
    :type form: dict[str | DataType | Static | Generic | Alias, Any] | list[Any]
    :param ignore_type: A list, or one value of Generic/str objects which if matched will ignore
        the line parsed.
    :type ignore_type: Optional[Union[list[Union[Generic, str]], Generic, str]]
    '''

    def __init__(
        self,
        form: Union[dict[Union[str, DataType, Static, Generic, Alias], Any], list[Any]],
        ignore_type: Optional[Union[list[Union[Generic, str]], Generic, str]] = None
    ) -> None:
        self.form = form
        # True when the top-level form is a dict rather than a list.
        self.named = isinstance(form, dict)
        # Rules whose matching lines are dropped before parsing; always stored as matchers.
        self.ignore_type: list[Generic] = []
        if ignore_type:
            # NOTE(review): `union` presumably wraps a lone rule into a list - confirm in _utils.
            rules = union(ignore_type)
            self.ignore_type = [
                Generic(rule + '{}', DataTypes.GENERIC) if isinstance(rule, str) else rule
                for rule in rules
            ]

    def parse(
        self,
        path: str,
        curr_path: list[str]
    ) -> dict:
        '''
        Read the text file at `path`, structure its lines according to `self.form`, and hand the
        result to `expand_generics`.

        Every line is stripped of surrounding whitespace; lines matching any rule in
        `self.ignore_type` are discarded before structural parsing.

        :param path: Path of the text file to read (opened as UTF-8).
        :param curr_path: Passed through unchanged to `expand_generics`.
        :return: Whatever `expand_generics(curr_path, data, self.form)` returns.
        :raises ValueError: If a single-line form fails to match, or the form contains a token
            of an unsupported type.
        '''
        # Imported at call time rather than module level, as in the original implementation.
        # NOTE(review): presumably this avoids a circular import - confirm.
        from .._main._engine import expand_generics

        def is_ignored(line: str) -> bool:
            # `match` returns a pair whose first element is the success flag.
            return any(rule.match(line)[0] for rule in self.ignore_type)

        with open(path, 'r', encoding='utf-8') as f:
            stripped_lines = (raw.strip() for raw in f)
            data = [line for line in stripped_lines if not is_ignored(line)]

        data, _ = TXTFile._parse(data, self.form)
        return expand_generics(
            curr_path,
            data,
            self.form
        )

    @staticmethod
    def _as_matcher(form: Any) -> Any:
        '''
        Wrap a bare string in `Static` and a bare `DataType` in a one-slot `Generic`; return any
        other object unchanged.
        '''
        if isinstance(form, str):
            return Static(form)
        if isinstance(form, DataType):
            return Generic('{}', form)
        return form

    @staticmethod
    def _parse(data: list[str], form: Any) -> Any:
        '''
        Dispatch on the type of `form` and parse the leading lines of `data` with it.

        :param data: Remaining (already stripped and filtered) lines.
        :param form: A `Pairing`, a single-line matcher (`Generic`/`Static`/`str`/`DataType`),
            a dict form, or a list form.
        :return: A `(parsed, consumed)` pair, where `consumed` counts lines from the start of
            `data`.
        :raises ValueError: If a single-line form does not match the first line (or there is no
            line to match), or if `form` is of an unsupported type.
        '''
        if isinstance(form, Pairing):
            # A Pairing only wraps an inner form as far as line structure is concerned.
            form = form.form
        if isinstance(form, (Generic, Static, str, DataType)):
            matcher = TXTFile._as_matcher(form)
            # `match` returns a (success, ...) pair; a 2-tuple is always truthy, so the flag
            # itself must be inspected or the failure branch below could never be reached.
            if data and matcher.match(data[0])[0]:
                # NOTE(review): the whole slice is returned, not just data[0]; kept as in the
                # original because the consumer (`expand_generics`) is not visible here.
                return data, 1
            raise ValueError("TXTFile Failed to parse")
        if isinstance(form, dict):
            return TXTFile._parse_dict(data, form)
        if isinstance(form, list):
            return TXTFile._parse_list(data, form)
        raise ValueError("Unknown Token")

    @staticmethod
    def _parse_list(data: list[str], form: list) -> tuple[list, int]:
        '''
        Parse a run of lines against the entries of `form`, cycling through the entries in order
        until a line fails to match or the data is exhausted.

        :param data: Remaining lines.
        :param form: List of sub-forms, applied round-robin.
        :return: `(items, consumed)` where `consumed` is the number of leading lines of `data`
            that were used.
        '''
        parsed_data: list = []
        ctr = 0  # index of the next unconsumed line
        i = 0    # index of the form entry to apply next
        while ctr < len(data):
            next_form = form[i]
            if isinstance(next_form, (list, dict)):
                # Recurse with the nested entry itself; passing the enclosing list would restart
                # this same parser on the same input.
                obj_data, endline = TXTFile._parse(data[ctr:], next_form)
                if endline == 0:
                    # The nested form consumed nothing: stop, mirroring the no-match case below,
                    # instead of spinning forever without advancing.
                    break
                parsed_data.append(obj_data)
                ctr += endline
            else:
                matched, _ = TXTFile._as_matcher(next_form).match(data[ctr])
                if not matched:
                    break
                parsed_data.append(data[ctr])
                ctr += 1
            i = (i + 1) % len(form)
        return parsed_data, ctr

    @staticmethod
    def _parse_dict(data: list[str], form: dict) -> tuple[dict, int]:
        '''
        Split `data` into sections, each opened by a line matching one of the keys of `form`, and
        parse each section body with the corresponding sub-form.

        Lines before the first matching key are skipped. Keys are tried in the dict's iteration
        order and the first match wins.

        :param data: Remaining lines.
        :param form: Mapping from key matcher (`str`/`DataType`/`Static`/`Generic`) to sub-form.
        :return: `(sections, consumed)` where `sections` maps each header line to its parsed
            body and `consumed` is the offset, from the start of `data`, just past the last
            line used. `({}, 0)` if no line matches any key.
        '''
        matchers: dict[Union[Static, Generic], Any] = {
            TXTFile._as_matcher(key): subform for key, subform in form.items()
        }

        parsed_data: dict = {}
        header = None   # the line that opened the section currently being collected
        subform = None  # sub-form used to parse that section's body
        start = -1      # index of the first body line of the current section; -1 = none yet
        for i, line in enumerate(data):
            key = next((m for m in matchers if m.match(line)[0]), None)
            if key is None:
                continue
            if start != -1:
                # Close the previous section. Its consumed count is not used: any body lines
                # the sub-form did not consume are dropped.
                parsed_data[header], _ = TXTFile._parse(data[start:i], subform)
            header, subform, start = line, matchers[key], i + 1

        if start == -1:
            return parsed_data, 0

        parsed_data[header], consumed = TXTFile._parse(data[start:], subform)
        # Report an offset relative to `data`, not to the last section, so that callers advancing
        # a cursor by this amount land on the first unconsumed line.
        return parsed_data, start + consumed