From 160f6b96270605a4d17b527d9ddcb4879a296d05 Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 16:50:53 -0800 Subject: [PATCH 1/9] Support filtering of nested fields Update filter_data_by_metadata function to allow filtering of nested fields - e.g. if property `address` has selected set to True, but property `address.street` has selected set to False, only the street would be excluded. Processes data recursively. --- singer/transform.py | 29 +++++++++++++++++++---------- 1 file changed, 19 insertions(+), 10 deletions(-) diff --git a/singer/transform.py b/singer/transform.py index 82888c2..0ead203 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -35,6 +35,13 @@ def unix_seconds_to_datetime(value): return strftime(datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc)) +def breadcrumb_path(breadcrumb): + name = ".".join(breadcrumb) + name = name.replace('properties.', '') + name = name.replace('.items', '[]') + return name + + class SchemaMismatch(Exception): def __init__(self, errors): if not errors: @@ -101,21 +108,23 @@ def __enter__(self): def __exit__(self, *args): self.log_warning() - def filter_data_by_metadata(self, data, metadata): + def filter_data_by_metadata(self, data, metadata, parent=()): if isinstance(data, dict) and metadata: - for field_name in list(data.keys()): - selected = singer.metadata.get(metadata, ('properties', field_name), 'selected') - inclusion = singer.metadata.get(metadata, ('properties', field_name), 'inclusion') + for field_name, field_data in data.items(): + breadcrumb = parent + ('properties', field_name) + selected = singer.metadata.get(metadata, breadcrumb, 'selected') + inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion') if inclusion == 'automatic': continue - if selected is False: - data.pop(field_name, None) - self.filtered.add(field_name) + if (selected is False) or (inclusion == 'unsupported'): + data[field_name] = None + 
self.filtered.add(breadcrumb_path(breadcrumb)) + + data[field_name] = self.filter_data_by_metadata(field_data, metadata, breadcrumb) - if inclusion == 'unsupported': - data.pop(field_name, None) - self.filtered.add(field_name) + if isinstance(data, list) and metadata: + data = [self.filter_data_by_metadata(d, metadata, parent) for d in data] return data From c76e52f94885e221359ab5b42061fe7b1fb2b888 Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 16:52:32 -0800 Subject: [PATCH 2/9] Update transform.py make formatting a little clearer --- singer/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/singer/transform.py b/singer/transform.py index 0ead203..e156034 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -124,7 +124,8 @@ def filter_data_by_metadata(self, data, metadata, parent=()): data[field_name] = self.filter_data_by_metadata(field_data, metadata, breadcrumb) if isinstance(data, list) and metadata: - data = [self.filter_data_by_metadata(d, metadata, parent) for d in data] + breadcrumb = parent + ('items', field_name) + data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data] return data From 68a8a82659ad295eed18de48d4bf0e0f37a94a46 Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 16:54:41 -0800 Subject: [PATCH 3/9] Update transform.py Fix array type breadcrumb name --- singer/transform.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/singer/transform.py b/singer/transform.py index e156034..1747966 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -124,8 +124,8 @@ def filter_data_by_metadata(self, data, metadata, parent=()): data[field_name] = self.filter_data_by_metadata(field_data, metadata, breadcrumb) if isinstance(data, list) and metadata: - breadcrumb = parent + ('items', field_name) - data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data] + breadcrumb = parent + ('items',) + data = 
[self.filter_data_by_metadata(d, metadata, parent + ('items', )) for d in data] return data From 3547c58b0e898f6d21c5fc9cb43b7dec35a6ad24 Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 16:55:29 -0800 Subject: [PATCH 4/9] Update transform.py breadcrumb path documentation --- singer/transform.py | 3 +++ 1 file changed, 3 insertions(+) diff --git a/singer/transform.py b/singer/transform.py index 1747966..49a7989 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -36,6 +36,9 @@ def unix_seconds_to_datetime(value): def breadcrumb_path(breadcrumb): + """ + Transform breadcrumb into familiar object dot-notation + """ name = ".".join(breadcrumb) name = name.replace('properties.', '') name = name.replace('.items', '[]') From 9b7dd7837020af3fdb863d9d66e0c9a264a464ec Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 17:16:35 -0800 Subject: [PATCH 5/9] Update transform.py change based on tests - must remove field from data object, not just set value to None. 
--- singer/transform.py | 8 ++++---- 1 file changed, 4 insertions(+), 4 deletions(-) diff --git a/singer/transform.py b/singer/transform.py index 49a7989..23bd33a 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -113,7 +113,7 @@ def __exit__(self, *args): def filter_data_by_metadata(self, data, metadata, parent=()): if isinstance(data, dict) and metadata: - for field_name, field_data in data.items(): + for field_name in list(data.keys()): breadcrumb = parent + ('properties', field_name) selected = singer.metadata.get(metadata, breadcrumb, 'selected') inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion') @@ -121,10 +121,10 @@ def filter_data_by_metadata(self, data, metadata, parent=()): continue if (selected is False) or (inclusion == 'unsupported'): - data[field_name] = None + data.pop(field_name, None) self.filtered.add(breadcrumb_path(breadcrumb)) - - data[field_name] = self.filter_data_by_metadata(field_data, metadata, breadcrumb) + else: + data[field_name] = self.filter_data_by_metadata(data[field_name], metadata, breadcrumb) if isinstance(data, list) and metadata: breadcrumb = parent + ('items',) From acbe4847087d5f9061440fd6cebf75c88d875c83 Mon Sep 17 00:00:00 2001 From: Chris Goddard Date: Wed, 20 Feb 2019 17:35:19 -0800 Subject: [PATCH 6/9] Update transform.py line length :) --- singer/transform.py | 3 ++- 1 file changed, 2 insertions(+), 1 deletion(-) diff --git a/singer/transform.py b/singer/transform.py index 23bd33a..49dd00e 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -124,7 +124,8 @@ def filter_data_by_metadata(self, data, metadata, parent=()): data.pop(field_name, None) self.filtered.add(breadcrumb_path(breadcrumb)) else: - data[field_name] = self.filter_data_by_metadata(data[field_name], metadata, breadcrumb) + data[field_name] = self.filter_data_by_metadata( + data[field_name], metadata, breadcrumb) if isinstance(data, list) and metadata: breadcrumb = parent + ('items',) From 
514f8cb18d2a62a9e16dcd206b6e31faa599432e Mon Sep 17 00:00:00 2001 From: Jude188 <17158624+Jude188@users.noreply.github.com> Date: Tue, 27 Oct 2020 09:55:43 +0000 Subject: [PATCH 7/9] Add tests for filtering nested fields --- tests/test_transform.py | 43 +++++++++++++++++++++++++++++++++++++++++ 1 file changed, 43 insertions(+) diff --git a/tests/test_transform.py b/tests/test_transform.py index 3ba57fa..c6861ef 100644 --- a/tests/test_transform.py +++ b/tests/test_transform.py @@ -295,6 +295,49 @@ def test_drops_fields_which_are_unsupported(self): dict_value = {"name": "chicken"} self.assertEqual({}, transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)) + def test_drops_nested_object_fields_which_are_unselected(self): + schema = {"type": "object", + "properties": {"addr": {"type": "object", + "properties": {"addr1": {"type": "string"}, + "city": {"type": "string"}, + "state": {"type": "string"}, + 'amount': {'type': 'integer'}}}}} + metadata = { + ('properties','addr'): {"selected": True}, + ('properties','addr', 'properties','amount'): {"selected": False} + } + data = {'addr': + {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'} + } + expected = {'addr': + {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'}, + } + self.assertDictEqual(expected, transform(data, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)) + + def test_drops_nested_array_fields_which_are_unselected(self): + schema = {"type": "object", + "properties": {"addrs": {"type": "array", + "items": {"type": "object", + "properties": {"addr1": {"type": "string"}, + "city": {"type": "string"}, + "state": {"type": "string"}, + 'amount': {'type': 'integer'}}}}}} + metadata = { + ('properties','addrs'): {"selected": True}, + ('properties','addrs','items','properties','amount'): {"selected": False} + } + data = {'addrs': [ + {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'}, + {'addr1': 'address_2', 'city': 
'city_2', 'state': 'state_2', 'amount': '456'} + ] + } + expected = {'addrs': [ + {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'}, + {'addr1': 'address_2', 'city': 'city_2', 'state': 'state_2'} + ] + } + self.assertDictEqual(expected, transform(data, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata)) + class TestResolveSchemaReferences(unittest.TestCase): def test_internal_refs_resolve(self): schema = {"type": "object", From 58520808d904125c4d5ad5d470633d9baa3788f7 Mon Sep 17 00:00:00 2001 From: Jude188 <17158624+Jude188@users.noreply.github.com> Date: Tue, 27 Oct 2020 10:15:37 +0000 Subject: [PATCH 8/9] Make pylint happy --- Makefile | 2 +- singer/transform.py | 2 +- 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/Makefile b/Makefile index 296c20f..2fe943e 100644 --- a/Makefile +++ b/Makefile @@ -8,5 +8,5 @@ install: check_prereqs python3 -m pip install -e '.[dev]' test: install - pylint singer -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access + pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access nosetests --with-doctest -v diff --git a/singer/transform.py b/singer/transform.py index b6d9871..f3fd4d5 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -56,7 +56,7 @@ def __init__(self, errors): msg = "Errors during transform\n\t{}".format("\n\t".join(estrs)) msg += "\n\n\nErrors during transform: [{}]".format(", ".join(estrs)) - super(SchemaMismatch, self).__init__(msg) + super().__init__(msg) class SchemaKey: ref = "$ref" From b3d7d70f44700ea894cb5630bfbae773da3bd55a Mon Sep 17 00:00:00 2001 From: Jude188 <17158624+Jude188@users.noreply.github.com> Date: Tue, 27 Oct 2020 10:30:03 +0000 Subject: [PATCH 9/9] Simplify one line --- 
singer/transform.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/singer/transform.py b/singer/transform.py index f3fd4d5..f0b8556 100644 --- a/singer/transform.py +++ b/singer/transform.py @@ -140,7 +140,7 @@ def filter_data_by_metadata(self, data, metadata, parent=()): if isinstance(data, list) and metadata: breadcrumb = parent + ('items',) - data = [self.filter_data_by_metadata(d, metadata, parent + ('items', )) for d in data] + data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data] return data