
Commit 5c030a4

Added Apache Arrow formats (fixes #1)
1 parent f863738 commit 5c030a4

10 files changed: 314 additions & 207 deletions

.github/workflows/python-package.yml

Lines changed: 1 addition & 1 deletion
@@ -19,7 +19,7 @@ jobs:
       fail-fast: false
       matrix:
         os: [ubuntu-latest]
-        python-version: ["3.8", "3.9", "3.10", "3.11", "3.12", "3.13"]
+        python-version: ["3.10", "3.11", "3.12", "3.13"]
     steps:
     - uses: actions/checkout@v3
     - name: Set up Python ${{ matrix.python-version }}

docs/pages/index.md

Lines changed: 38 additions & 32 deletions
@@ -1,32 +1,38 @@
-# Introduction
-
-DSFF (DataSet File Format) is a tiny library relying on [`openpyxl`](https://pypi.org/project/openpyxl) that allows storing a dataset with its features for use with machine learning in an XLSX file whose structure is enforced. It is intended to make it easy to store, edit and exchange a dataset.
-
-It is used with the [Packing Box](https://github.com/packing-box/docker-packing-box) to export datasets in a convenient format.
-
------
-
-## Setup
-
-This library is available on [PyPI](https://pypi.python.org/pypi/dsff/) and can simply be installed using Pip:
-
-```sh
-pip install --user dsff
-```
-
------
-
-## Format
-
-DSFF is straightforward and contains only the minimum needed for storing a dataset.
-
-The following document properties of the XLSX format are used:
-
-- `title`: holds the name of the dataset
-- `description`: holds a serialized dictionary of the dataset's metadata
-
-An XLSX workbook formatted as a DSFF has two and only two worksheets:
-
-1. `data`: the matrix of the whole dataset (including headers), possibly containing samples' metadata but mostly the feature values
-2. `features`: the name-description pairs of each feature used in `data` (including two headers: `name` and `description`)
-
+# Introduction
+
+DSFF (DataSet File Format) is a tiny library relying on [`openpyxl`](https://pypi.org/project/openpyxl) that allows storing a dataset with its features for use with machine learning in an XLSX file whose structure is enforced. It is intended to make it easy to store, edit and exchange a dataset.
+
+It is used with the [Packing Box](https://github.com/packing-box/docker-packing-box) to export datasets in a convenient format.
+
+-----
+
+## Setup
+
+This library is available on [PyPI](https://pypi.python.org/pypi/dsff/) and can simply be installed using Pip:
+
+```sh
+pip install --user dsff
+```
+
+If you want to use the additional [Apache Arrow](https://arrow.apache.org/docs/index.html) formats, you can install [`pyarrow`](https://arrow.apache.org/docs/python/index.html) with the following command:
+
+```sh
+pip install --user dsff[extra]
+```
+
+-----
+
+## Format
+
+DSFF is straightforward and contains only the minimum needed for storing a dataset.
+
+The following document properties of the XLSX format are used:
+
+- `title`: holds the name of the dataset
+- `description`: holds a serialized dictionary of the dataset's metadata
+
+An XLSX workbook formatted as a DSFF has two and only two worksheets:
+
+1. `data`: the matrix of the whole dataset (including headers), possibly containing samples' metadata but mostly the feature values
+2. `features`: the name-description pairs of each feature used in `data` (including two headers: `name` and `description`)
+
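A note on the `description` property above: the exact serialization DSFF uses is not shown in this diff, but a minimal sketch of the "serialized dictionary" convention, assuming a `repr()`/`ast.literal_eval()` round-trip (`literal_eval` does appear in the new `pa.py` further down; the metadata keys here are made up for illustration):

```python
from ast import literal_eval

# Hypothetical illustration: dataset metadata is a plain dict stored as its
# repr() string in the XLSX 'description' document property, and parsed back
# with ast.literal_eval, which safely evaluates Python literals only.
metadata = {"name": "my-dataset", "samples": 250, "packed": True}
serialized = repr(metadata)          # the string that would be stored
restored = literal_eval(serialized)  # what a reader would recover
assert restored == metadata
```

`literal_eval` is preferred over `eval` here because it rejects anything that is not a literal, so a tampered `description` property cannot execute code.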

docs/pages/usage.md

Lines changed: 24 additions & 0 deletions
@@ -82,3 +82,27 @@ Converting from other formats to DSFF | Converting from DSFF to other formats
     f.to_dataset()  # creates ./[dsff-title] with data.csv, features.json and metadata.json
 ```
 
+**Creating a Feather dataset from a DSFF**
+
+```python
+>>> import dsff
+>>> with dsff.DSFF("/path/to/my-dataset.feather") as f:
+        f.to_feather()  # creates ./my-dataset.feather
+```
+
+**Creating an ORC dataset from a DSFF**
+
+```python
+>>> import dsff
+>>> with dsff.DSFF("/path/to/my-dataset.orc") as f:
+        f.to_orc()  # creates ./my-dataset.orc
+```
+
+**Creating a Parquet dataset from a DSFF**
+
+```python
+>>> import dsff
+>>> with dsff.DSFF("/path/to/my-dataset.parquet") as f:
+        f.to_parquet()  # creates ./my-dataset.parquet
+```
+

pyproject.toml

Lines changed: 6 additions & 1 deletion
@@ -16,7 +16,7 @@ authors = [
 description = "DataSet File Format (DSFF)"
 license = {file = "LICENSE"}
 keywords = ["python", "programming", "dataset-file-format", "dsff"]
-requires-python = ">=3.8,<4"
+requires-python = ">=3.10,<4"
 classifiers = [
     "Development Status :: 5 - Production/Stable",
     "Environment :: Console",
@@ -33,6 +33,11 @@ dependencies = [
 ]
 dynamic = ["version"]
 
+[project.optional-dependencies]
+extra = [
+    "pyarrow",
+]
+
 [project.readme]
 file = "README.md"
 content-type = "text/markdown"
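The new `[project.optional-dependencies]` table makes `pyarrow` an opt-in extra (`pip install dsff[extra]`), so the library has to cope with it being absent at runtime. A minimal sketch of one common guard pattern, not necessarily dsff's actual code (`require_pyarrow` is a hypothetical helper):

```python
from importlib.util import find_spec

# Detect whether the optional 'pyarrow' extra is installed, without importing
# it (find_spec only searches for the module, it does not load it).
HAS_PYARROW = find_spec("pyarrow") is not None

def require_pyarrow():
    # Hypothetical helper: fail with an actionable hint instead of letting a
    # bare ImportError surface deep inside a to_parquet()/from_orc() call.
    if not HAS_PYARROW:
        raise ImportError("Apache Arrow formats require pyarrow; "
                          "install it with: pip install dsff[extra]")
```

Checking once at import time keeps the error message consistent across all three Arrow-backed formats.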

pytest.ini

Lines changed: 2 additions & 0 deletions
@@ -0,0 +1,2 @@
+[pytest]
+pythonpath = src

requirements.txt

Lines changed: 1 addition & 0 deletions
@@ -1 +1,2 @@
 openpyxl
+pyarrow

src/dsff/VERSION.txt

Lines changed: 1 addition & 1 deletion
@@ -1 +1 @@
-1.1.0
+1.2.0

src/dsff/formats/pa.py

Lines changed: 29 additions & 0 deletions
@@ -0,0 +1,29 @@
+# -*- coding: UTF-8 -*-
+from .__common__ import *
+
+
+__all__ = []
+
+
+def _nowrite(m):
+    raise NotImplementedError(f"none of {m}.write_table and {m}.write_{m} is implemented")
+
+
+for module in ["feather", "orc", "parquet"]:
+    __all__ += [f"from_{module}", f"to_{module}"]
+    def gen_func(m):
+        def from_(dsff, path=None, exclude=DEFAULT_EXCL):
+            dataset = globals()[m].read_table(path)
+            dsff.write(data=[dataset.schema.names] + [list(r.values()) for r in dataset.to_pylist()],
+                       metadata=literal_eval(dataset.schema.metadata.pop(b'__metadata__', b"{}").decode()),
+                       features={k.decode(): v.decode() for k, v in dataset.schema.metadata.items()})
+        from_.__name__ = f"from_{m}"
+        def to_(dsff, path=None, text=False):
+            with (BytesIO() if text else open(path, 'wb+')) as f:
+                getattr(globals()[m], "write_table", getattr(globals()[m], f"write_{m}", _nowrite))(dsff._to_table(), f)
+            if text:
+                return f.getvalue()
+        to_.__name__ = f"to_{m}"
+        return from_, to_
+    globals()[f'from_{module}'], globals()[f'to_{module}'] = gen_func(module)
+

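The `pa.py` above generates each `from_*`/`to_*` pair inside a loop; routing the loop variable through the `gen_func(m)` parameter is what binds each generated function to its own format name. A stdlib-only sketch of the late-binding pitfall this avoids, and the factory fix the commit uses (`make_getters_*` are hypothetical names for illustration):

```python
def make_getters_buggy(names):
    # Pitfall: a closure captures the *variable* 'name', not its value at
    # append time, so after the loop every function sees the last name.
    funcs = []
    for name in names:
        funcs.append(lambda: name)
    return funcs

def make_getters_fixed(names):
    # Fix (the gen_func approach in pa.py): pass the loop variable through a
    # factory parameter so each closure gets its own independent binding.
    def gen(n):
        return lambda: n
    return [gen(name) for name in names]

buggy = make_getters_buggy(["feather", "orc", "parquet"])
fixed = make_getters_fixed(["feather", "orc", "parquet"])
assert [f() for f in buggy] == ["parquet", "parquet", "parquet"]
assert [f() for f in fixed] == ["feather", "orc", "parquet"]
```

Setting `__name__` on each generated function, as the commit does, also keeps tracebacks and introspection readable (`from_orc` instead of `from_`).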