Skip to content

[ENH] Fixes Issue Improve _check_params method in kmeans.py and kmedoids.py #2682

New issue

Have a question about this project? Sign up for a free GitHub account to open an issue and contact its maintainers and the community.

By clicking “Sign up for GitHub”, you agree to our terms of service and privacy statement. We’ll occasionally send you account related emails.

Already on GitHub? Sign in to your account

Open
wants to merge 11 commits into
base: main
Choose a base branch
from
16 changes: 10 additions & 6 deletions aeon/clustering/_k_means.py
Original file line number Diff line number Diff line change
Expand Up @@ -287,23 +287,27 @@ def _predict(self, X: np.ndarray, y=None) -> np.ndarray:
def _check_params(self, X: np.ndarray) -> None:
self._random_state = check_random_state(self.random_state)

_incorrect_init_str = (
f"The value provided for init: {self.init} is "
f"invalid. The following are a list of valid init algorithms "
f"strings: random, kmeans++, first. You can also pass a "
f"np.ndarray of size (n_clusters, n_channels, n_timepoints)"
)

if isinstance(self.init, str):
if self.init == "random":
self._init = self._random_center_initializer
elif self.init == "kmeans++":
self._init = self._kmeans_plus_plus_center_initializer
elif self.init == "first":
self._init = self._first_center_initializer
else:
raise ValueError(_incorrect_init_str)
else:
if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters:
self._init = self.init.copy()
else:
raise ValueError(
f"The value provided for init: {self.init} is "
f"invalid. The following are a list of valid init algorithms "
f"strings: random, kmedoids++, first. You can also pass a"
f"np.ndarray of size (n_clusters, n_channels, n_timepoints)"
)
raise ValueError(_incorrect_init_str)

if self.distance_params is None:
self._distance_params = {}
Expand Down
24 changes: 16 additions & 8 deletions aeon/clustering/_k_medoids.py
Original file line number Diff line number Diff line change
Expand Up @@ -46,13 +46,17 @@ class TimeSeriesKMedoids(BaseClusterer):
The number of clusters to form as well as the number of centroids to generate.
init : str or np.ndarray, default='random'
Method for initialising cluster centers. Any of the following are valid:
['kmedoids++', 'random', 'first'].
['kmedoids++', 'random', 'first', 'build'].
Random is the default as it is very fast and it was found in [2] to
perform about as well as the other methods.
Kmedoids++ is a variant of kmeans++ [4] and is slower but often more
accurate than random. It works by choosing centroids that are distant
from one another. First is the fastest method and simply chooses the
first k time series as centroids.
first k time series as centroids. Build [1] greedily selects the k medoids
by first selecting the medoid that minimizes the sum of distances
to all other points (this point is the most centrally located) and then
iteratively selecting the next k-1 medoids, each maximizing the decrease in the
sum of distances of all other points to their respective medoids selected so far.
Comment on lines +55 to +59
Copy link
Member

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the original method used in the paper?

Copy link
Contributor Author

Choose a reason for hiding this comment

The reason will be displayed to describe this comment to others. Learn more.

Is this the original method used in the paper?

Yes, this is the "BUILD" phase as stated in the paper
image

If a np.ndarray is provided, it must be of shape (n_clusters,) and contain
the indexes of the time series to use as centroids.
distance : str or Callable, default='msm'
Expand Down Expand Up @@ -428,6 +432,13 @@ def _assign_clusters(
def _check_params(self, X: np.ndarray) -> None:
self._random_state = check_random_state(self.random_state)

_incorrect_init_str = (
f"The value provided for init: {self.init} is "
f"invalid. The following are a list of valid init algorithms "
f"strings: random, kmedoids++, first, build. You can also pass a "
f"np.ndarray of size (n_clusters, n_channels, n_timepoints)"
)

if isinstance(self.init, str):
if self.init == "random":
self._init = self._random_center_initializer
Expand All @@ -437,16 +448,13 @@ def _check_params(self, X: np.ndarray) -> None:
self._init = self._first_center_initializer
elif self.init == "build":
self._init = self._pam_build_center_initializer
else:
raise ValueError(_incorrect_init_str)
else:
if isinstance(self.init, np.ndarray) and len(self.init) == self.n_clusters:
self._init = self.init
else:
raise ValueError(
f"The value provided for init: {self.init} is "
f"invalid. The following are a list of valid init algorithms "
f"strings: random, kmedoids++, first. You can also pass a"
f"np.ndarray of size (n_clusters, n_channels, n_timepoints)"
)
raise ValueError(_incorrect_init_str)

if self.distance_params is not None:
self._distance_params = self.distance_params
Expand Down
Loading