Skip to content

PR: Add metadata attribute to DataFrame and Column #43

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Closed
Closed
Show file tree
Hide file tree
Changes from 1 commit
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
14 changes: 14 additions & 0 deletions protocol/dataframe_protocol.py
Original file line number Diff line number Diff line change
Expand Up @@ -285,6 +285,13 @@ def null_count(self) -> Optional[int]:
"""
pass

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata specific to the column.

    The annotation promises a dictionary with string keys and arbitrary
    values.  This definition carries no implementation of its own.
    """

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -350,6 +357,13 @@ def __dataframe__(self, nan_as_null : bool = False) -> dict:
"version": 0 # Version number of the protocol
}

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata specific to the DataFrame.

    The annotation promises a dictionary with string keys and arbitrary
    values.  This definition carries no implementation of its own.
    """

def num_columns(self) -> int:
"""
Return the number of columns in the DataFrame
Expand Down
33 changes: 32 additions & 1 deletion protocol/pandas_implementation.py
Original file line number Diff line number Diff line change
Expand Up @@ -426,6 +426,13 @@ def null_count(self) -> int:
"""
return self._col.isna().sum()

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata of this column as a dictionary.

    The only entry is ``"num_chunks"``, holding whatever
    ``self.num_chunks()`` reports.
    """
    chunk_count = self.num_chunks()
    return {"num_chunks": chunk_count}

def num_chunks(self) -> int:
"""
Return the number of chunks the column consists of.
Expand Down Expand Up @@ -495,6 +502,11 @@ def __init__(self, df : pd.DataFrame, nan_as_null : bool = False) -> None:
# dtypes is added, this value should be propagated to columns.
self._nan_as_null = nan_as_null

@property
def metadata(self) -> Dict[str, Any]:
    """
    Return the metadata of this DataFrame as a dictionary.

    The result has two entries: ``"num_chunks"`` holds the value of
    ``self.num_chunks()`` and ``"num_columns"`` holds the value of
    ``self.num_columns()``.
    """
    return {
        "num_chunks": self.num_chunks(),
        "num_columns": self.num_columns(),
    }

def num_columns(self) -> int:
    """
    Return the number of entries in ``self._df.columns``.
    """
    column_labels = self._df.columns
    return len(column_labels)

Expand Down Expand Up @@ -578,9 +590,28 @@ def test_categorical_dtype():
tm.assert_frame_equal(df, df2)


def test_metadata():
    """
    Check the ``metadata`` property on the interchange DataFrame and Column.

    Builds a three-column integer frame, verifies the metadata dictionaries
    returned at DataFrame level and at Column level, then round-trips the
    frame through ``from_dataframe`` and checks it is unchanged.
    """
    df = pd.DataFrame(data=dict(a=[1, 2, 3], b=[4, 5, 6], c=[7, 8, 9]))

    # Check the metadata from the dataframe.  Whole dictionaries are
    # compared so that a missing or unexpected key fails the test; a loop
    # over the keys of the actual result would pass on an empty dict.
    df_metadata = df.__dataframe__().metadata
    expected_df = {"num_chunks": 1, "num_columns": 3}
    assert df_metadata == expected_df

    # Check the metadata from the column against its own expectation
    # (not the DataFrame-level one).
    col_metadata = df.__dataframe__().get_column(0).metadata
    expected_col = {"num_chunks": 1}
    assert col_metadata == expected_col

    df2 = from_dataframe(df)
    tm.assert_frame_equal(df, df2)


if __name__ == '__main__':
    # Run every test in this module, in a fixed order, when the file is
    # executed as a script.
    for run_test in (
        test_categorical_dtype,
        test_float_only,
        test_mixed_intfloat,
        test_noncontiguous_columns,
        test_metadata,
    ):
        run_test()