I found myself writing a subclass of `pl.DataFrame` that keeps track of the
operations performed on it. For example:
from pprint import pformat, pprint
import polars as pl
import polars._typing as plt
from collections import UserList
from dataclasses import dataclass, field
from typing import Any, Iterable, Optional, Self
from numpy import ndarray
@dataclass
class CalcMeta(UserList):
    """List-like container for arbitrary metadata entries.

    The single dataclass field is named ``data`` because that is the
    attribute ``UserList`` reads and writes for all of its list behaviour.
    When no list is supplied, each instance gets its own new empty list.
    """

    # default_factory (rather than a literal ``[]``) gives every instance a
    # separate list object.
    data: list[Any] = field(default_factory=list)
@dataclass
class CalcReport(UserList):
    """Ordered log of ``(name, value)`` steps.

    ``data`` can be given positionally or by keyword. It must be accepted
    positionally because the ``UserList`` operations that build a new
    sequence (slicing, ``+``, ``*``) construct it with
    ``self.__class__(data)``; a keyword-only field makes those operations
    raise ``TypeError``.
    """

    data: list[tuple[str, Any]] = field(default_factory=list)

    def append(self, **kwargs: Any) -> None:  # type: ignore
        """Record one step per keyword argument, in the order given.

        Each ``name=value`` pair is stored as a ``(name, value)`` tuple at
        the end of the log. The signature intentionally differs from
        ``list.append(item)``, hence the type-ignore above.
        """
        self.data.extend(kwargs.items())
class CalcDataFrame(pl.DataFrame):
    """A ``pl.DataFrame`` that carries metadata and an optional operation log.

    ``filter`` and ``with_columns`` are overridden so that each call records
    the string form of its arguments in ``report`` (when a report is
    attached) and returns a new ``CalcDataFrame`` wrapping the result.
    """

    meta: CalcMeta
    report: Optional[CalcReport] = None

    def __init__(
        self,
        data: pl.DataFrame,
        meta: Optional[CalcMeta] = None,
        report: Optional[CalcReport] = None,
    ):
        """Wrap ``data`` together with ``meta`` and an optional ``report``.

        Args:
            data: Frame whose contents this instance is built from.
            meta: Metadata to attach; a new empty ``CalcMeta`` when omitted.
            report: Log that recorded operations are appended to. When
                ``None``, nothing is recorded.
        """
        super().__init__(data)
        # A ``CalcMeta()`` default in the signature is evaluated once at
        # definition time, so every frame created without ``meta`` would
        # share (and mutate) one object. Build it per instance instead.
        self.meta = CalcMeta() if meta is None else meta
        self.report = report

    def filter(
        self,
        *predicates: pl.Expr
        | pl.Series
        | str
        | Iterable[pl.Expr | pl.Series | str]
        | bool
        | list[bool]
        | ndarray[Any, Any],
        **constraints: Any,
    ) -> Self:
        """Filter rows as ``pl.DataFrame.filter`` does and log the call.

        The step is recorded under ``filtered_with`` with the ``str()`` of
        the predicates and constraints. It is recorded before the filter
        runs, so it is logged even if the filter then raises.
        """
        return self.append_report(
            filtered_with={
                "predicates": str(predicates),
                "constraints": str(constraints),
            }
        ).derive(super().filter(*predicates, **constraints))

    def with_columns(
        self,
        *exprs: plt.IntoExpr | Iterable[plt.IntoExpr],
        **named_exprs: plt.IntoExpr,
    ) -> Self:
        """Add columns as ``pl.DataFrame.with_columns`` does and log the call.

        The step is recorded under ``with_columns`` with the ``str()`` of the
        positional and named expressions, before the operation itself runs.
        """
        return self.append_report(
            with_columns={"exprs": str(exprs), "named_exprs": str(named_exprs)}
        ).derive(super().with_columns(*exprs, **named_exprs))

    def append_report(self, **kwargs) -> Self:
        """Append ``kwargs`` to the report in place, if there is one.

        Returns ``self`` so the call can be chained.
        """
        if self.report is not None:
            self.report.append(**kwargs)
        return self

    def derive(self, data: pl.DataFrame, meta: Optional[CalcMeta] = None) -> Self:
        """Build a new frame of the same class around ``data``.

        The new frame's metadata is this frame's ``meta`` followed by
        ``meta`` (a new ``CalcMeta`` object either way). The report object is
        passed on as-is, so the derived frame and this frame log to the same
        ``CalcReport``.
        """
        extra = CalcMeta() if meta is None else meta
        return self.__class__(data, self.meta + extra, self.report)
# Three small columns used to build the plain polars frame for the demo.
columns = [
    pl.Series("alpha", ["a", "b", "c"]),
    pl.Series("beta", ["x", "x", "a"]),
    pl.Series("xs", [0, 1, 2]),
]
xs = pl.DataFrame(columns)
# Called on the plain pl.DataFrame with no arguments; the result is discarded.
xs.with_columns()

# Wrap the frame so that the operations below are recorded in a report.
xs = CalcDataFrame(xs, meta=CalcMeta(["some meta data"]), report=CalcReport())

only_a = pl.col("alpha").eq("a")
relabel_beta = pl.col("beta").replace_strict({"x": "y"})
xs = xs.filter(only_a).with_columns(relabel_beta)

# Each report entry is a (name, value) pair; print them in recorded order.
if xs.report:
    for name, payload in xs.report:
        print(f"{name}:")
        print(f" {pformat(payload)}")
print(xs)
filtered_with:
{'constraints': '{}',
'predicates': '(<Expr ['[(col("alpha")) == (String(a))…'] at '
'0x7F1258222300>,)'}
with_columns:
{'exprs': '(<Expr ['col("beta").replace_strict([Se…'] at 0x7F1258223680>,)',
'named_exprs': '{}'}
shape: (1, 3)
┌───────┬──────┬─────┐
│ alpha ┆ beta ┆ xs │
│ --- ┆ --- ┆ --- │
│ str ┆ str ┆ i64 │
╞═══════╪══════╪═════╡
│ a ┆ y ┆ 0 │
└───────┴──────┴─────┘
I stopped myself before I went too far for two reasons:
- This functionality likely already exists. I found the following relevant information:
- Logging in Polars
- Polars show_graph method
- Some expressions don’t have a nice string representation off the bat:
import polars as pl
# Return dtype for the replacement: a list of enum values over "a" and "b".
categories = pl.Series(["a", "b"])
list_of_enum = pl.List(pl.Enum(categories))

expr = pl.col("somecol").replace_strict(
    {"hello": "world"},
    return_dtype=list_of_enum,
)
# The printed form shows the mapping only as "[Series, Series]".
print(expr)
col("somecol").replace_strict([Series, Series])
The trouble with `show_graph` is that it doesn’t present its output in a format
that is useful to me (i.e. it uses notation that is meant to help Polars
library authors).
Am I missing some obvious functionality that does what I want? If not: how can I pretty-print expressions such as
`replace_strict` so that the inner Series etc. that they are built on are also printed in full?
(Otherwise, I do have various ideas I can update this question with that let me capture what I need.)