
Commit b7cd425

avi09 and trekhleb authored
Added kmeans clustering (#595)

* added kmeans

Co-authored-by: Oleksii Trekhleb <[email protected]>
1 parent 90ec1b7 commit b7cd425

File tree: 4 files changed, +167 −0 lines

Diff for: README.md (+1)

```diff
@@ -147,6 +147,7 @@ a set of rules that precisely define a sequence of operations.
   * **Machine Learning**
     * `B` [NanoNeuron](https://github.com/trekhleb/nano-neuron) - 7 simple JS functions that illustrate how machines can actually learn (forward/backward propagation)
     * `B` [k-NN](src/algorithms/ml/knn) - k-nearest neighbors classification algorithm
+    * `B` [k-Means](src/algorithms/ml/kmeans) - k-Means clustering algorithm
   * **Uncategorized**
     * `B` [Tower of Hanoi](src/algorithms/uncategorized/hanoi-tower)
     * `B` [Square Matrix Rotation](src/algorithms/uncategorized/square-matrix-rotation) - in-place algorithm
```

Diff for: src/algorithms/ml/kmeans/README.md (+32, new file)

# k-Means Algorithm

The **k-Means algorithm** is an unsupervised Machine Learning algorithm. It is a clustering algorithm that groups the sample data based on the similarity between the dimensions of the vectors.

In k-Means classification, the output is a set of classes assigned to each vector. Each cluster location is continuously optimized so that the clusters accurately represent their groups of points.

The idea is to calculate the similarity between each cluster location and each data vector, and reassign clusters based on it. [Euclidean distance](https://en.wikipedia.org/wiki/Euclidean_distance) is mostly used for this task.
![Euclidean distance between two points](https://upload.wikimedia.org/wikipedia/commons/5/55/Euclidean_distance_2d.svg)

_Image source: [Wikipedia](https://en.wikipedia.org/wiki/Euclidean_distance)_
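For two vectors, this distance is the square root of the summed squared per-dimension differences. A minimal JavaScript sketch (the implementation added by this commit also rounds the result to two decimal places; this sketch does not):

```javascript
// Euclidean distance between two equal-length vectors.
function euclideanDistance(x1, x2) {
  if (x1.length !== x2.length) {
    throw new Error('Inconsistent vector lengths');
  }
  let squaresTotal = 0;
  for (let i = 0; i < x1.length; i += 1) {
    squaresTotal += (x1[i] - x2[i]) ** 2;
  }
  return Math.sqrt(squaresTotal);
}

// A 3-4-5 right triangle: the distance from (0, 0) to (3, 4) is 5.
console.log(euclideanDistance([0, 0], [3, 4])); // → 5
```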
The algorithm is as follows:

1. Check for errors like invalid/inconsistent data
2. Initialize the k cluster locations with initial/random k points
3. Calculate the distance of each data point from each cluster center
4. Assign to each data point the label of the cluster at its minimum distance
5. Recalculate the centroid of each cluster based on the data points it contains
6. Repeat the steps above until the centroid locations stop changing
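The steps above can be condensed into a compact, self-contained sketch. This is an illustration, not the implementation added by this commit: it omits step 1's validation, seeds centroids with the first k points, and keeps labels in a separate array instead of appending them to the data set.

```javascript
// A compact k-Means sketch following the steps above.
function kMeansSketch(points, k) {
  const distance = (a, b) =>
    Math.sqrt(a.reduce((sum, ai, i) => sum + (ai - b[i]) ** 2, 0));

  // Step 2: seed the k centroids with the first k points.
  let centroids = points.slice(0, k).map((p) => [...p]);
  let labels = new Array(points.length).fill(-1);

  for (;;) {
    // Steps 3-4: label each point with the index of its nearest centroid.
    const next = points.map((p) => {
      let best = 0;
      for (let c = 1; c < k; c += 1) {
        if (distance(p, centroids[c]) < distance(p, centroids[best])) best = c;
      }
      return best;
    });

    // Step 6: stop once no label changed between iterations.
    if (next.every((label, i) => label === labels[i])) break;
    labels = next;

    // Step 5: move each centroid to the mean of its assigned points.
    centroids = centroids.map((c, ci) => {
      const members = points.filter((_, pi) => labels[pi] === ci);
      if (members.length === 0) return c; // keep an empty cluster in place
      return c.map((_, d) => members.reduce((s, m) => s + m[d], 0) / members.length);
    });
  }

  return labels;
}

const dataSet = [[1, 1], [6, 2], [3, 3], [4, 5], [9, 2], [2, 4], [8, 7]];
console.log(kMeansSketch(dataSet, 2)); // → [0, 1, 0, 1, 1, 0, 1]
```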
Here is a visualization of k-Means clustering for better understanding:

![k-Means convergence visualization](https://upload.wikimedia.org/wikipedia/commons/e/ea/K-means_convergence.gif)

_Image source: [Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)_

The centroids move continuously in order to create a better distinction between the different sets of data points. As we can see, after a few iterations the difference in centroids between iterations is quite low; for example, between iterations `13` and `14` the difference is small because the optimizer is only tuning boundary cases.
## References

- [k-Means clustering on Wikipedia](https://en.wikipedia.org/wiki/K-means_clustering)

Diff for: src/algorithms/ml/kmeans/__test__/kmeans.test.js (+36, new file)

```js
import kMeans from '../kmeans';

describe('kMeans', () => {
  it('should throw an error on invalid data', () => {
    expect(() => {
      kMeans();
    }).toThrowError('Either dataSet or labels or toClassify were not set');
  });

  it('should throw an error on inconsistent data', () => {
    expect(() => {
      kMeans([[1, 2], [1]], 2);
    }).toThrowError('Inconsistent vector lengths');
  });

  it('should assign each data point to its nearest cluster', () => {
    const dataSet = [[1, 1], [6, 2], [3, 3], [4, 5], [9, 2], [2, 4], [8, 7]];
    const k = 2;
    const expectedCluster = [0, 1, 0, 1, 1, 0, 1];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });

  it('should find the clusters with equal distances', () => {
    const dataSet = [[0, 0], [1, 1], [2, 2]];
    const k = 3;
    const expectedCluster = [0, 1, 2];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });

  it('should assign clusters in 3D space', () => {
    const dataSet = [[0, 0, 0], [0, 1, 0], [2, 0, 2]];
    const k = 2;
    const expectedCluster = [1, 1, 0];
    expect(kMeans(dataSet, k)).toEqual(expectedCluster);
  });
});
```

Diff for: src/algorithms/ml/kmeans/kmeans.js (+98, new file)

```js
/**
 * Calculates the Euclidean distance between two vectors.
 *
 * @param {number[]} x1
 * @param {number[]} x2
 * @returns {number}
 */
function euclideanDistance(x1, x2) {
  // Checking for errors.
  if (x1.length !== x2.length) {
    throw new Error('Inconsistent vector lengths');
  }
  // Calculate the Euclidean distance between the two vectors and return it.
  let squaresTotal = 0;
  for (let i = 0; i < x1.length; i += 1) {
    squaresTotal += (x1[i] - x2[i]) ** 2;
  }
  return Number(Math.sqrt(squaresTotal).toFixed(2));
}

/**
 * Clusters the data points using the k-Means algorithm.
 *
 * @param {number[][]} dataSetm - array of data points, i.e. [[0, 1], [3, 4], [5, 7]]
 * @param {number} k - number of clusters
 * @return {number[]} - the cluster index assigned to each data point
 */
export default function kMeans(
  dataSetm,
  k = 1,
) {
  const dataSet = dataSetm;
  if (!dataSet) {
    throw new Error('Either dataSet or labels or toClassify were not set');
  }

  // Starting the algorithm:
  // assign the k cluster locations equal to the locations of the initial k points.
  const clusterCenters = [];
  const nDim = dataSet[0].length;
  for (let i = 0; i < k; i += 1) {
    clusterCenters[clusterCenters.length] = Array.from(dataSet[i]);
  }

  // Continue the optimization until convergence:
  // centroids should not be moving once optimized.
  // Calculate the distance of each data vector from each cluster center,
  // then assign a cluster number to each data vector according to the minimum distance.
  let flag = true;
  while (flag) {
    flag = false;
    // Calculate and store the distance of each dataSet point from each cluster.
    for (let i = 0; i < dataSet.length; i += 1) {
      for (let n = 0; n < k; n += 1) {
        dataSet[i][nDim + n] = euclideanDistance(clusterCenters[n], dataSet[i].slice(0, nDim));
      }

      // Assign the cluster number to each dataSet point.
      const sliced = dataSet[i].slice(nDim, nDim + k);
      let minmDistCluster = Math.min(...sliced);
      for (let j = 0; j < sliced.length; j += 1) {
        if (minmDistCluster === sliced[j]) {
          minmDistCluster = j;
          break;
        }
      }

      // Keep iterating while any point receives a new cluster assignment.
      if (dataSet[i].length !== nDim + k + 1
        || dataSet[i][nDim + k] !== minmDistCluster) {
        flag = true;
        dataSet[i][nDim + k] = minmDistCluster;
      }
    }
    // Recalculate the cluster centroid values from all dimensions of the points under it.
    for (let i = 0; i < k; i += 1) {
      clusterCenters[i] = Array(nDim).fill(0);
      let classCount = 0;
      for (let j = 0; j < dataSet.length; j += 1) {
        if (dataSet[j][dataSet[j].length - 1] === i) {
          classCount += 1;
          for (let n = 0; n < nDim; n += 1) {
            clusterCenters[i][n] += dataSet[j][n];
          }
        }
      }
      for (let n = 0; n < nDim; n += 1) {
        clusterCenters[i][n] = Number((clusterCenters[i][n] / classCount).toFixed(2));
      }
    }
  }
  // Return the clusters assigned.
  const soln = [];
  for (let i = 0; i < dataSet.length; i += 1) {
    soln.push(dataSet[i][dataSet[i].length - 1]);
  }
  return soln;
}
```
