Declarative parser for structured data files.
- Getting started
- Usage
- API Reference
  - File types
  - Types
  - Computed fields
  - Return types
  - Error handling
- License
We recommend taking the time to read this README in its entirety before doing anything, to get a good overview of the process and avoid wrongly anticipating any of the steps.
## Getting started

From this point on, we assume you have mise installed on your machine.

Use mise to install Python and Poetry:

```bash
mise install
# You can then test that they are available
python --version
poetry --version
```

Install the Python dev dependencies and set up the pre-commit hooks:

```bash
poetry install
pre-commit install
```

## Usage

```python
import magicparse

schema = {
    "file_type": "csv",
    "has_header": False,
    "delimiter": ";",
    "fields": [
        {
            "key": "ean",
            "column-number": 2,
            "type": "str",
            "validators": [
                {
                    "name": "regex-matches",
                    "parameters": {"pattern": "^\\d{13}$"},
                }
            ],
        },
        {"key": "label", "column-number": 3, "type": "str"},
        {"key": "family-code", "column-number": 8, "type": "str"},
        {
            "key": "vat",
            "column-number": 10,
            "type": "decimal",
            "optional": False,
        },
        {
            "key": "initial-price",
            "column-number": 11,
            "type": "decimal",
            "post-processors": [
                {
                    "name": "divide",
                    "parameters": {"denominator": 100},
                },
                {
                    "name": "round",
                    "parameters": {"precision": 3},
                },
            ],
        },
        {
            "key": "unit-of-measurement",
            "column-number": 12,
            "type": "int",
            "pre-processors": [
                {
                    "name": "map",
                    "parameters": {"values": {"K": 0, "A": 1, "L": 2}},
                }
            ],
        },
    ],
    "computed-fields": [
        {
            "key": "code",
            "type": "str",
            "builder": {
                "name": "concat",
                "parameters": {"fields": ["code_1", "code_2"]},
            },
        },
        {
            "key": "volume",
            "type": "decimal",
            "builder": {
                "name": "divide",
                "parameters": {
                    "numerator": "price",
                    "denominator": "price_by_unit",
                },
            },
        },
        {
            "key": "price_by_unit",
            "type": "decimal",
            "builder": {
                "name": "multiply",
                "parameters": {
                    "x_factor": "price",
                    "y_factor": "unit",
                },
            },
        },
    ],
}
rows = magicparse.parse(data="...", schema=schema)
```

You can register a custom type converter:

```python
from uuid import UUID
import magicparse


class GuidConverter(magicparse.TypeConverter):
    @staticmethod
    def key() -> str:
        return "guid"

    def apply(self, value):
        return UUID(value)

magicparse.register(GuidConverter)
schema = {
    "file_type": "csv",
    "fields": [
        {"key": "shop-guid", "type": "guid", "column-number": 1}
    ],
}
rows = magicparse.parse("13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2", schema)
assert rows == [{"shop-guid": "13ec10cc-cc7e-4ee9-b091-9caa6d11aeb2"}]import magicparse
class PipedSchema(magicparse.Schema):
    @staticmethod
    def key() -> str:
        return "piped"

    def get_reader(self, stream):
        for item in stream.read().split("|"):
            yield [item]

magicparse.register(PipedSchema)
schema = {
    "file_type": "piped",
    "fields": [
        {"key": "name", "type": "str", "column-number": 1}
    ],
}
rows = magicparse.parse("Joe|William|Jack|Averell", schema)
assert rows == [{"name": "Joe"}, {"name": "William"}, {"name": "Jack"}, {"name": "Averell"}]For large files, you can use streaming to process data incrementally:
import magicparse
schema = {
    "file_type": "csv",
    "fields": [
        {"key": "name", "type": "str", "column-number": 1}
    ],
}
# Process data in chunks
for row in magicparse.stream_parse(data="...", schema=schema):
    match row:
        case magicparse.RowParsed(values):
            print(f"The values {values}.")
        case magicparse.RowFailed(errors):
            print(f"The errors {errors}.")
        case magicparse.RowSkipped(errors):
            print(f"The row was skipped with errors {errors}.")
        case _:
            print("Unknown type of row.")
```

By default, magicparse uses UTF-8 encoding. You can specify a different encoding:

```python
schema = {
    "file_type": "csv",
    "encoding": "iso8859_5",  # or any other encoding
    "fields": [
        {"key": "name", "type": "str", "column-number": 1}
    ],
}
```

## API Reference

### File types

- CSV (with or without header)
- Columnar
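
The CSV file type is shown in the usage example above. For columnar (fixed-width) files, each field is read as a slice of the line rather than a delimited column. Below is a minimal sketch; the `column-start` and `column-length` field keys are assumptions used for illustration, not confirmed API.

```python
import magicparse

# Hypothetical columnar schema: each field is a fixed-width slice of the line.
# The "column-start"/"column-length" keys are assumed, not confirmed by this README.
schema = {
    "file_type": "columnar",
    "fields": [
        {"key": "ean", "type": "str", "column-start": 0, "column-length": 13},
        {"key": "label", "type": "str", "column-start": 13, "column-length": 30},
    ],
}

rows = magicparse.parse(data="...", schema=schema)
```
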
### Types

- str
- int
- decimal
- datetime (timezone aware)
- time (timezone aware)
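
As an illustration, here is a sketch of a field parsed as a timezone-aware `datetime`; the ISO 8601 input shown is an assumption about what the converter accepts.

```python
import magicparse

schema = {
    "file_type": "csv",
    "fields": [
        # Assumes input such as "2024-01-31T08:30:00+00:00" (ISO 8601 with an offset)
        {"key": "received-at", "type": "datetime", "column-number": 1},
    ],
}

rows = magicparse.parse(data="2024-01-31T08:30:00+00:00", schema=schema)
```
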
### Pre-processors

- left-pad-zeroes
- map
- regex-extract
- replace
- strip-whitespaces
- left-strip
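
Pre-processors run on the raw value before type conversion. The `map` entry below reuses the parameters from the usage example; treating `strip-whitespaces` as parameter-less is an assumption.

```python
# A field combining two pre-processors (sketch; see the usage example above)
field = {
    "key": "unit-of-measurement",
    "column-number": 12,
    "type": "int",
    "pre-processors": [
        # Assumed to take no parameters: trims surrounding whitespace first
        {"name": "strip-whitespaces"},
        # Maps raw codes to integers before the "int" conversion
        {"name": "map", "parameters": {"values": {"K": 0, "A": 1, "L": 2}}},
    ],
}
```
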
### Validators

- regex-matches
- greater-than
- not-null-or-empty
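
Validators check the converted value and report an error when it does not hold. `regex-matches` and its `pattern` parameter appear in the usage example; the `threshold` parameter name for `greater-than` is an assumption.

```python
# A decimal field validated to be strictly positive (sketch)
field = {
    "key": "initial-price",
    "column-number": 11,
    "type": "decimal",
    "validators": [
        # Parameter name "threshold" is assumed, not confirmed by this README
        {"name": "greater-than", "parameters": {"threshold": 0}},
    ],
}
```
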
### Post-processors

- divide
- round
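
Post-processors transform the value after type conversion, for example turning a price stored in cents into a rounded decimal. The parameters below are the ones shown in the usage example.

```python
# "1234" -> 12.34, rounded to 3 decimal places
field = {
    "key": "initial-price",
    "column-number": 11,
    "type": "decimal",
    "post-processors": [
        {"name": "divide", "parameters": {"denominator": 100}},
        {"name": "round", "parameters": {"precision": 3}},
    ],
}
```
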
### Computed fields

Computed fields support the same types, pre-processors, post-processors, and validators as regular fields. Their value is produced by a builder.

#### Builders

- concat
- divide
- multiply
- coalesce
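
The `concat`, `divide`, and `multiply` builders appear in the usage example above. Below is a sketch of a `coalesce` computed field; the `fields` parameter name mirrors `concat` and is an assumption for `coalesce`.

```python
# Pick the first non-empty value among the listed fields (sketch)
computed_field = {
    "key": "display-label",
    "type": "str",
    "builder": {
        # "fields" is assumed to be the parameter name, as for "concat"
        "name": "coalesce",
        "parameters": {"fields": ["label", "family-code"]},
    },
}
```
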
### Return types

The parser returns a list of row objects:

- `RowParsed`: successfully parsed row with a `values` dict
- `RowFailed`: failed row with an `errors` message
- `RowSkipped`: skipped row with an `errors` message
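
A small sketch of splitting the result by row type, assuming (as described above) that the returned objects expose `values` and `errors` attributes:

```python
import magicparse

schema = {
    "file_type": "csv",
    "fields": [{"key": "name", "type": "str", "column-number": 1}],
}

rows = magicparse.parse(data="Joe\nWilliam\n", schema=schema)

# Keep successfully parsed values and collect failures separately
parsed = [row.values for row in rows if isinstance(row, magicparse.RowParsed)]
failed = [row.errors for row in rows if isinstance(row, magicparse.RowFailed)]
```
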
### Error handling

You can configure error handling for types, validators, and processors:

```python
{
    "key": "price",
    "type": {
        "key": "decimal",
        "nullable": True,       # Allow null values
        "on-error": "skip-row"  # Skip on error instead of raising
    }
}
```

Error handling options:

"raise"(default): Raise exception on error"skip-row": Skip the row and continue processing
## License

This project is licensed under the MIT License.