Skip to content

Add getting learner's data as pandas.DataFrame; add learner.to_dataframe method #358

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Merged
merged 32 commits into from
Sep 19, 2022
Merged
Show file tree
Hide file tree
Changes from all commits
Commits
Show all changes
32 commits
Select commit Hold shift + click to select a range
ffd8c04
Add getting learnerto pandas; add learner.as_dataframe
basnijholt Sep 6, 2022
9cc0e8c
Implement assign_defaults
basnijholt Sep 13, 2022
a1dcefd
Add LearnerND.to_dataframe
basnijholt Sep 13, 2022
db12d2f
Add BalancingLearner.to_dataframe
basnijholt Sep 13, 2022
dba35e0
Use **kwargs in BalancingLearner.to_dataframe
basnijholt Sep 13, 2022
777100c
Fix point_names type-hint
basnijholt Sep 13, 2022
fffc8f3
Rename default_parameters -> _default_parameters
basnijholt Sep 13, 2022
191f781
Add test_to_dataframe
basnijholt Sep 13, 2022
178a497
Write tests where we readd the data
basnijholt Sep 13, 2022
de380d5
Add SequenceLearner.to_dataframe
basnijholt Sep 13, 2022
25286f6
use future annotations
basnijholt Sep 13, 2022
f86780f
Add learner.load_dataframe
basnijholt Sep 13, 2022
fcab901
Improve test and typing
basnijholt Sep 13, 2022
84a0af6
Pass kw to learner.to_dataframe
basnijholt Sep 13, 2022
998c06c
Fix typeguard issues
basnijholt Sep 13, 2022
6e24fd4
Set mean=False by default
basnijholt Sep 13, 2022
78d4def
Add a section about the dataframes loading to example-notebook.ipynb
basnijholt Sep 13, 2022
9b95c20
Add data exporting section to the first tutorial
basnijholt Sep 13, 2022
eb9c6e3
Add pandas to the doc environment
basnijholt Sep 13, 2022
6d0f23d
Use the correct function
basnijholt Sep 13, 2022
82a3cb5
Add doc-strings to to_dataframe
basnijholt Sep 13, 2022
68055fe
Add BalancingLearner.load_dataframe
basnijholt Sep 13, 2022
2a2e43d
Add doc-strings to load_dataframe
basnijholt Sep 13, 2022
2437659
Fix doc-string in SequenceLearner
basnijholt Sep 14, 2022
d859c45
Fix DataFrame in darkmode
basnijholt Sep 14, 2022
75472f0
Use _set_data in Learner2D.load_dataframe
basnijholt Sep 14, 2022
222b84f
Raise ValueError when function_prefix is empty
basnijholt Sep 14, 2022
02362b9
Implement DataSaver.to_dataframe
basnijholt Sep 14, 2022
b88b803
Implement DataSaver.load_dataframe
basnijholt Sep 14, 2022
e46daf0
Test for attrs
basnijholt Sep 14, 2022
cfc2ed4
Use future annotations
basnijholt Sep 14, 2022
e091772
Remove start_index from function
basnijholt Sep 19, 2022
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
92 changes: 89 additions & 3 deletions adaptive/learner/average_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -8,8 +8,20 @@

from adaptive.learner.base_learner import BaseLearner
from adaptive.notebook_integration import ensure_holoviews
from adaptive.types import Float, Real
from adaptive.utils import cache_latest
from adaptive.types import Float, Int, Real
from adaptive.utils import (
assign_defaults,
cache_latest,
partial_function_from_dataframe,
)

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False


class AverageLearner(BaseLearner):
Expand Down Expand Up @@ -70,6 +82,80 @@ def to_numpy(self):
"""Data as NumPy array of size (npoints, 2) with seeds and values."""
return np.array(sorted(self.data.items()))

def to_dataframe(
    self,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    y_name: str = "y",
) -> pandas.DataFrame:
    """Export the learner's data to a `pandas.DataFrame`.

    The frame has one row per ``(seed, value)`` item of ``learner.data``,
    ordered by seed. The column names are also recorded in ``df.attrs``
    under the ``"inputs"`` and ``"output"`` keys.

    Parameters
    ----------
    with_default_function_args : bool, optional
        Include the ``learner.function``'s default arguments as a
        column, by default True
    function_prefix : str, optional
        Prefix to the ``learner.function``'s default arguments' names,
        by default "function."
    seed_name : str, optional
        Name of the ``seed`` parameter, by default "seed"
    y_name : str, optional
        Name of the output value, by default "y"

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")

    rows = sorted(self.data.items())
    df = pandas.DataFrame(rows, columns=[seed_name, y_name])

    # Record which columns are inputs and which is the output so that
    # consumers of the frame do not have to guess the column roles.
    df.attrs.update(inputs=[seed_name], output=y_name)

    if with_default_function_args:
        # The helper's return value is unused; the frame is passed in
        # so that it can be extended by the helper.
        assign_defaults(self.function, df, function_prefix)
    return df

def load_dataframe(
    self,
    df: pandas.DataFrame,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    y_name: str = "y",
):
    """Feed the rows of a `pandas.DataFrame` into the learner.

    The seed and value columns are passed to ``tell_many``. When
    ``with_default_function_args`` is True, ``learner.function`` is
    afterwards replaced by a version whose default arguments are set
    (using `functools.partial`) from the values in the
    `pandas.DataFrame`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to load.
    with_default_function_args : bool, optional
        The ``with_default_function_args`` used in ``to_dataframe()``,
        by default True
    function_prefix : str, optional
        The ``function_prefix`` used in ``to_dataframe``, by default "function."
    seed_name : str, optional
        The ``seed_name`` used in ``to_dataframe``, by default "seed"
    y_name : str, optional
        The ``y_name`` used in ``to_dataframe``, by default "y"
    """
    seeds = df[seed_name].values
    values = df[y_name].values
    self.tell_many(seeds, values)

    if not with_default_function_args:
        return

    # Rebind the function only after the data has been loaded.
    self.function = partial_function_from_dataframe(
        self.function, df, function_prefix
    )

def ask(self, n: int, tell_pending: bool = True) -> tuple[list[int], list[Float]]:
points = list(range(self.n_requested, self.n_requested + n))

Expand All @@ -87,7 +173,7 @@ def ask(self, n: int, tell_pending: bool = True) -> tuple[list[int], list[Float]
self.tell_pending(p)
return points, loss_improvements

def tell(self, n: int, value: Real) -> None:
def tell(self, n: Int, value: Real) -> None:
if n in self.data:
# The point has already been added before.
return
Expand Down
123 changes: 120 additions & 3 deletions adaptive/learner/average_learner1D.py
Original file line number Diff line number Diff line change
Expand Up @@ -14,7 +14,16 @@

from adaptive.learner.learner1D import Learner1D, _get_intervals
from adaptive.notebook_integration import ensure_holoviews
from adaptive.types import Real
from adaptive.types import Int, Real
from adaptive.utils import assign_defaults, partial_function_from_dataframe

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False

Point = Tuple[int, Real]
Points = List[Point]
Expand Down Expand Up @@ -127,6 +136,112 @@ def min_samples_per_point(self) -> int:
return 0
return min(self._number_samples.values())

def to_numpy(self, mean: bool = False) -> np.ndarray:
    """Return the learner's data as a NumPy array.

    Parameters
    ----------
    mean : bool, optional
        If True, return whatever the parent class's ``to_numpy()``
        returns. If False, return one row per individual sample, laid
        out as ``(seed, x, *y)`` where ``y`` is flattened through
        `numpy.atleast_1d`. By default False.

    Returns
    -------
    numpy.ndarray
    """
    if mean:
        return super().to_numpy()

    rows = []
    for x, samples in self._data_samples.items():
        for seed, y in samples.items():
            # atleast_1d lets scalar and vector outputs share one code path.
            rows.append((seed, x, *np.atleast_1d(y)))
    return np.array(rows)

def to_dataframe(
    self,
    mean: bool = False,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    x_name: str = "x",
    y_name: str = "y",
) -> pandas.DataFrame:
    """Export the learner's data to a `pandas.DataFrame`.

    Parameters
    ----------
    mean : bool, optional
        If True, export the ``(x, y)`` items of ``learner.data`` (no
        seed column). If False, export every individual sample as a
        ``(seed, x, y)`` row. Rows are sorted by ``x`` (and then by
        seed). By default False.
    with_default_function_args : bool, optional
        Include the ``learner.function``'s default arguments as a
        column, by default True
    function_prefix : str, optional
        Prefix to the ``learner.function``'s default arguments' names,
        by default "function."
    seed_name : str, optional
        Name of the ``seed`` parameter, by default "seed"
    x_name : str, optional
        Name of the ``x`` parameter, by default "x"
    y_name : str, optional
        Name of the output value, by default "y"

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")

    if mean:
        columns = [x_name, y_name]
        data = sorted(self.data.items())
    else:
        columns = [seed_name, x_name, y_name]
        data = []
        for x, samples in sorted(self._data_samples.items()):
            for seed, y in sorted(samples.items()):
                data.append((seed, x, y))

    df = pandas.DataFrame(data, columns=columns)

    # NOTE(review): "inputs" lists ``seed_name`` even when ``mean=True``
    # and the frame has no seed column -- confirm this is intended.
    df.attrs.update(inputs=[seed_name, x_name], output=y_name)

    if with_default_function_args:
        # The helper's return value is unused; the frame is passed in
        # so that it can be extended by the helper.
        assign_defaults(self.function, df, function_prefix)
    return df

def load_dataframe(
    self,
    df: pandas.DataFrame,
    with_default_function_args: bool = True,
    function_prefix: str = "function.",
    seed_name: str = "seed",
    x_name: str = "x",
    y_name: str = "y",
):
    """Feed the rows of a `pandas.DataFrame` into the learner.

    The ``(seed, x)`` pairs and the values are passed to ``tell_many``.
    When ``with_default_function_args`` is True, ``learner.function`` is
    afterwards replaced by a version whose default arguments are set
    (using `functools.partial`) from the values in the
    `pandas.DataFrame`.

    Parameters
    ----------
    df : pandas.DataFrame
        The data to load.
    with_default_function_args : bool, optional
        The ``with_default_function_args`` used in ``to_dataframe()``,
        by default True
    function_prefix : str, optional
        The ``function_prefix`` used in ``to_dataframe``, by default "function."
    seed_name : str, optional
        The ``seed_name`` used in ``to_dataframe``, by default "seed"
    x_name : str, optional
        The ``x_name`` used in ``to_dataframe``, by default "x"
    y_name : str, optional
        The ``y_name`` used in ``to_dataframe``, by default "y"
    """
    # We're pairing the two columns with zip instead of taking
    # df[[seed_name, x_name]].values, because the latter would turn
    # the seeds into floats.
    seeds = df[seed_name].values.tolist()
    xs = df[x_name].values.tolist()
    self.tell_many(list(zip(seeds, xs)), df[y_name].values)

    if not with_default_function_args:
        return

    # Rebind the function only after the data has been loaded.
    self.function = partial_function_from_dataframe(
        self.function, df, function_prefix
    )

def ask(self, n: int, tell_pending: bool = True) -> tuple[Points, list[float]]:
"""Return 'n' points that are expected to maximally reduce the loss."""
# If some point is undersampled, resample it
Expand Down Expand Up @@ -362,7 +477,9 @@ def _calc_error_in_mean(self, ys: Iterable[Real], y_avg: Real, n: int) -> float:
t_student = scipy.stats.t.ppf(1 - self.alpha, df=n - 1)
return t_student * (variance_in_mean / n) ** 0.5

def tell_many(self, xs: Points, ys: Sequence[Real]) -> None:
def tell_many(
self, xs: Points | np.ndarray, ys: Sequence[Real] | np.ndarray
) -> None:
# Check that all x are within the bounds
# TODO: remove this requirement, all other learners add the data
# but ignore it going forward.
Expand All @@ -373,7 +490,7 @@ def tell_many(self, xs: Points, ys: Sequence[Real]) -> None:
)

# Create a mapping of points to a list of samples
mapping: DefaultDict[Real, DefaultDict[int, Real]] = defaultdict(
mapping: DefaultDict[Real, DefaultDict[Int, Real]] = defaultdict(
lambda: defaultdict(dict)
)
for (seed, x), y in zip(xs, ys):
Expand Down
57 changes: 57 additions & 0 deletions adaptive/learner/balancing_learner.py
Original file line number Diff line number Diff line change
Expand Up @@ -11,6 +11,14 @@
from adaptive.notebook_integration import ensure_holoviews
from adaptive.utils import cache_latest, named_product, restore

try:
import pandas

with_pandas = True

except ModuleNotFoundError:
with_pandas = False


def dispatch(child_functions, arg):
index, x = arg
Expand Down Expand Up @@ -381,6 +389,55 @@ def from_product(cls, f, learner_type, learner_kwargs, combos):
learners.append(learner)
return cls(learners, cdims=arguments)

def to_dataframe(self, index_name: str = "learner_index", **kwargs):
    """Export the data of all child learners as one `pandas.DataFrame`.

    Each child's frame gets a leading column holding the child's
    position in ``self.learners``; the frames are then concatenated
    row-wise with a fresh index.

    Parameters
    ----------
    index_name : str, optional
        The name of the index column indicating the learner index,
        by default "learner_index".
    **kwargs : dict
        Keyword arguments passed to each ``child_learner.to_dataframe(**kwargs)``.

    Returns
    -------
    pandas.DataFrame

    Raises
    ------
    ImportError
        If `pandas` is not installed.
    """
    if not with_pandas:
        raise ImportError("pandas is not installed.")

    frames = []
    for position, child in enumerate(self.learners):
        child_df = child.to_dataframe(**kwargs)
        # Remember the child's own columns so the index column can be
        # moved to the front after it has been added.
        original_columns = list(child_df.columns)
        child_df[index_name] = position
        frames.append(child_df[[index_name] + original_columns])

    return pandas.concat(frames, axis=0, ignore_index=True)

def load_dataframe(
    self, df: pandas.DataFrame, index_name: str = "learner_index", **kwargs
):
    """Distribute the rows of a `pandas.DataFrame` over the child learners.

    The frame is grouped by the ``index_name`` column and each group is
    handed to the child learner at that position in ``self.learners``.

    Parameters
    ----------
    df : pandas.DataFrame
        DataFrame with the data to load.
    index_name : str, optional
        The ``index_name`` used in `to_dataframe`, by default "learner_index".
    **kwargs : dict
        Keyword arguments passed to each ``child_learner.load_dataframe(**kwargs)``.
    """
    grouped = df.groupby(index_name)
    for learner_index, rows in grouped:
        child = self.learners[learner_index]
        child.load_dataframe(rows, **kwargs)

def save(self, fname, compress=True):
"""Save the data of the child learners into pickle files
in a directory.
Expand Down
Loading