diff --git a/Makefile b/Makefile
index 296c20f..2fe943e 100644
--- a/Makefile
+++ b/Makefile
@@ -8,5 +8,5 @@ install: check_prereqs
 	python3 -m pip install -e '.[dev]'
 
 test: install
-	pylint singer -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
+	pylint singer --extension-pkg-whitelist=ciso8601 -d missing-docstring,broad-except,bare-except,too-many-return-statements,too-many-branches,too-many-arguments,no-else-return,too-few-public-methods,fixme,protected-access
 	nosetests --with-doctest -v
diff --git a/singer/transform.py b/singer/transform.py
index c165e38..f0b8556 100644
--- a/singer/transform.py
+++ b/singer/transform.py
@@ -36,6 +36,37 @@ def unix_seconds_to_datetime(value):
     return strftime(datetime.datetime.fromtimestamp(int(value), datetime.timezone.utc))
 
 
+def breadcrumb_path(breadcrumb):
+    """
+    Transform breadcrumb into familiar object dot-notation
+
+    The breadcrumb is walked structurally instead of being rewritten with
+    substring replacement, so field names that equal or merely contain the
+    schema keywords 'properties' / 'items' are rendered unchanged.
+
+    >>> breadcrumb_path(('properties', 'addrs', 'items', 'properties', 'amount'))
+    'addrs[].amount'
+    >>> breadcrumb_path(('properties', 'order', 'properties', 'items'))
+    'order.items'
+    >>> breadcrumb_path(('properties', 'my_properties', 'properties', 'x'))
+    'my_properties.x'
+    """
+    parts = []
+    crumbs = iter(breadcrumb)
+    for crumb in crumbs:
+        if crumb == 'properties':
+            # The crumb following 'properties' is always a field name, even
+            # when that name is itself 'properties' or 'items'.
+            parts.append(next(crumbs, crumb))
+        elif crumb == 'items' and parts:
+            # Array element: tag the enclosing field instead of adding a
+            # new path segment.
+            parts[-1] += '[]'
+        else:
+            parts.append(crumb)
+    return ".".join(parts)
+
+
 class SchemaMismatch(Exception):
     def __init__(self, errors):
         if not errors:
@@ -46,7 +77,7 @@ def __init__(self, errors):
             msg = "Errors during transform\n\t{}".format("\n\t".join(estrs))
             msg += "\n\n\nErrors during transform: [{}]".format(", ".join(estrs))
 
-        super(SchemaMismatch, self).__init__(msg)
+        super().__init__(msg)
 
 class SchemaKey:
     ref = "$ref"
@@ -110,25 +141,27 @@ def __enter__(self):
     def __exit__(self, *args):
         self.log_warning()
 
-    def filter_data_by_metadata(self, data, metadata):
+    def filter_data_by_metadata(self, data, metadata, parent=()):
         if isinstance(data, dict) and metadata:
             for field_name in list(data.keys()):
-                selected = singer.metadata.get(metadata, ('properties', field_name), 'selected')
-                inclusion = singer.metadata.get(metadata, ('properties', field_name), 'inclusion')
+                breadcrumb = parent + ('properties', field_name)
+                selected = singer.metadata.get(metadata, breadcrumb, 'selected')
+                inclusion = singer.metadata.get(metadata, breadcrumb, 'inclusion')
                 if inclusion == 'automatic':
                     continue
 
-                if selected is False:
+                if (selected is False) or (inclusion == 'unsupported'):
                     data.pop(field_name, None)
                     # Track that a field was filtered because the customer
-                    # didn't select it.
-                    self.filtered.add(field_name)
+                    # didn't select it or the tap declared it as unsupported.
+                    self.filtered.add(breadcrumb_path(breadcrumb))
+                else:
+                    data[field_name] = self.filter_data_by_metadata(
+                        data[field_name], metadata, breadcrumb)
 
-                if inclusion == 'unsupported':
-                    data.pop(field_name, None)
-                    # Track that the field was filtered because the tap
-                    # declared it as unsupported.
-                    self.filtered.add(field_name)
+        if isinstance(data, list) and metadata:
+            breadcrumb = parent + ('items',)
+            data = [self.filter_data_by_metadata(d, metadata, breadcrumb) for d in data]
 
         return data
 
diff --git a/tests/test_transform.py b/tests/test_transform.py
index 3ba57fa..c6861ef 100644
--- a/tests/test_transform.py
+++ b/tests/test_transform.py
@@ -295,6 +295,49 @@ def test_drops_fields_which_are_unsupported(self):
         dict_value = {"name": "chicken"}
         self.assertEqual({}, transform(dict_value, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))
 
+    def test_drops_nested_object_fields_which_are_unselected(self):
+        schema = {"type": "object",
+                  "properties": {"addr": {"type": "object",
+                                          "properties": {"addr1": {"type": "string"},
+                                                         "city": {"type": "string"},
+                                                         "state": {"type": "string"},
+                                                         'amount': {'type': 'integer'}}}}}
+        metadata = {
+            ('properties','addr'): {"selected": True},
+            ('properties','addr', 'properties','amount'): {"selected": False}
+        }
+        data = {'addr':
+                {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'}
+        }
+        expected = {'addr':
+                    {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'},
+        }
+        self.assertDictEqual(expected, transform(data, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))
+
+    def test_drops_nested_array_fields_which_are_unselected(self):
+        schema = {"type": "object",
+                  "properties": {"addrs": {"type": "array",
+                                           "items": {"type": "object",
+                                                     "properties": {"addr1": {"type": "string"},
+                                                                    "city": {"type": "string"},
+                                                                    "state": {"type": "string"},
+                                                                    'amount': {'type': 'integer'}}}}}}
+        metadata = {
+            ('properties','addrs'): {"selected": True},
+            ('properties','addrs','items','properties','amount'): {"selected": False}
+        }
+        data = {'addrs': [
+            {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1', 'amount': '123'},
+            {'addr1': 'address_2', 'city': 'city_2', 'state': 'state_2', 'amount': '456'}
+            ]
+        }
+        expected = {'addrs': [
+            {'addr1': 'address_1', 'city': 'city_1', 'state': 'state_1'},
+            {'addr1': 'address_2', 'city': 'city_2', 'state': 'state_2'}
+            ]
+        }
+        self.assertDictEqual(expected, transform(data, schema, NO_INTEGER_DATETIME_PARSING, metadata=metadata))
+
 class TestResolveSchemaReferences(unittest.TestCase):
     def test_internal_refs_resolve(self):
         schema = {"type": "object",