From 1e7671020d53201d4e269e1349e71938dc609a8b Mon Sep 17 00:00:00 2001 From: generall Date: Mon, 9 Nov 2015 19:33:47 +0300 Subject: [PATCH 01/13] minor changes --- lib/k_means/centroid.rb | 10 +++++++--- lib/k_means/k_means.rb | 5 +++-- 2 files changed, 10 insertions(+), 5 deletions(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 7bf02e2..71bd421 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -1,6 +1,9 @@ class Centroid class << self + + # initial centroid positions are randomly chosen from within + # a bounding box that encloses all the nodes def create_centroids(amount, nodes) ranges = create_ranges(nodes, nodes[0].position.size) (1..amount).map do @@ -13,14 +16,15 @@ def create_centroids(amount, nodes) private + # find centroid ranges as a bounding box for all nodes def create_ranges(nodes, dimensions) - ranges = Array.new(dimensions) {[0.0, 0.0]} + ranges = Array.new(dimensions) {[Float::NAN, Float::NAN]} nodes.each do |node| node.position.each_with_index do |position, index| # Bottom range - ranges[index][0] = position if position < ranges[index][0] + ranges[index][0] = position if ranges[index][0].nan? || position < ranges[index][0] # Top range - ranges[index][1] = position if position > ranges[index][1] + ranges[index][1] = position if ranges[index][1].nan? || position > ranges[index][1] end end ranges diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index f2138f7..e5ab415 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -2,7 +2,7 @@ class KMeans - attr_reader :centroids, :nodes + attr_reader :centroids, :nodes, :max_iterations def initialize(data, options={}) distance_measure = options[:distance_measure] || :euclidean_distance @@ -10,6 +10,7 @@ def initialize(data, options={}) @centroids = options[:custom_centroids] || Centroid.create_centroids(options[:centroids] || 4, @nodes) @verbose = options[:verbose] + @max_iterations = options[:max_iterations] || 100 perform_cluster_process end @@ -26,7 +27,7 @@ def view def perform_cluster_process iterations, updates = 0, 1 - while updates > 0 && iterations < 100 + while updates > 0 && iterations < max_iterations iterations += 1 verbose_message("Iteration #{iterations}") updates = 0 From cae0710cc32e48017e6f7fd8f0977a2025ca1103 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:18:05 +0300 Subject: [PATCH 02/13] =?UTF-8?q?add=20Davies=E2=80=93Bouldin=20index?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/k_means/centroid.rb | 38 +++++++++++++++++++++++++++++--------- lib/k_means/k_means.rb | 22 ++++++++++++++++++++-- 2 files changed, 49 insertions(+), 11 deletions(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 71bd421..06f2fdd 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -1,7 +1,7 @@ class Centroid - + class << self - + # initial centroid positions are randomly chosen from within # a bounding box that encloses all the nodes def create_centroids(amount, nodes) @@ -13,10 +13,10 @@ def create_centroids(amount, nodes) new(position) end end - + private - - # find centroid ranges as a bounding box for all nodes + + # find centroi d ranges as a bounding box for all nodes def create_ranges(nodes, dimensions) ranges = Array.new(dimensions) {[Float::NAN, Float::NAN]} nodes.each do |node| @@ -30,13 +30,33 @@ def create_ranges(nodes, dimensions) ranges end end - + attr_accessor :position - + def initialize(position) @position = position + @mean_distance = nil end - + + def mean_node_distance + + return @mean_distance if @mean_distance + + total_dist = 0.0 + total_nodes = @nodes.size + + total_dist.reduce(0){|sum, node| sum + node.best_distance} + + if total_nodes > 0 + @mean_distance = total_dist/total_nodes + else + # if there no nodes in cluster, so the centroid is bad + @mean_distance = 1.0/0.0 + end + + @mean_distance + end + # Finds the average distance of all the nodes assigned to # the centroid and then moves the centroid to that position def reposition(nodes, centroids) @@ -49,5 +69,5 @@ def reposition(nodes, centroids) end @position = averages.map {|x| x / nodes.size} end - + end diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index e5ab415..6c92ad0 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -5,8 +5,8 @@ class KMeans attr_reader :centroids, :nodes, :max_iterations def initialize(data, options={}) - distance_measure = options[:distance_measure] || :euclidean_distance - @nodes = Node.create_nodes(data, distance_measure) + @distance_measure = options[:distance_measure] || :euclidean_distance + @nodes = Node.create_nodes(data, @distance_measure) @centroids = options[:custom_centroids] || Centroid.create_centroids(options[:centroids] || 4, @nodes) @verbose = options[:verbose] @@ -60,6 +60,24 @@ def update_nodes sum end + # Davies–Bouldin index: http://en.wikipedia.org/wiki/Cluster_analysis#Evaluation_of_clustering + def davies_bouldin_index(centroids = @centroids) + c_sz = centroids.size + db_index = 0 + 0..csz.each do |i| + max_db_index = -1.0/0 + 0..csz.each do |j| + if i != j + centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) + sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance + max_db_index = [max_db_index, sum_mean_nodes / centroid_dist].max + end + end + db_index += max_db_index + end + return db_index/c_sz + end + def reposition_centroids centroid_positions = @centroids.map(&:position) @centroids.each do |centroid| From 703f583e4d7356060346470dfb2f0cda6fb4feca Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:26:27 +0300 Subject: [PATCH 03/13] =?UTF-8?q?make=20Davies=E2=80=93Bouldin=20public?= MIME-Version: 1.0 Content-Type: text/plain; charset=UTF-8 Content-Transfer-Encoding: 8bit --- lib/k_means/k_means.rb | 38 ++++++++++++++++++++------------------ 1 file changed, 20 insertions(+), 18 deletions(-) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 6c92ad0..fd91edf 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -23,6 +23,26 @@ def view @centroid_pockets end + # Davies–Bouldin index: http://en.wikipedia.org/wiki/Cluster_analysis#Evaluation_of_clustering + def davies_bouldin_index(centroids = @centroids) + c_sz = centroids.size + db_index = 0 + 0..csz.each do |i| + max_db_index = -1.0/0 + 0..csz.each do |j| + if i != j + centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) + sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance + max_db_index = [max_db_index, sum_mean_nodes / centroid_dist].max + end + end + db_index += max_db_index + end + return db_index/c_sz + end + + + private def perform_cluster_process @@ -60,24 +80,6 @@ def update_nodes sum end - # Davies–Bouldin index: http://en.wikipedia.org/wiki/Cluster_analysis#Evaluation_of_clustering - def davies_bouldin_index(centroids = @centroids) - c_sz = centroids.size - db_index = 0 - 0..csz.each do |i| - max_db_index = -1.0/0 - 0..csz.each do |j| - if i != j - centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) - sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance - max_db_index = [max_db_index, sum_mean_nodes / centroid_dist].max - end - end - db_index += max_db_index - end - return db_index/c_sz - end - def reposition_centroids centroid_positions = @centroids.map(&:position) @centroids.each do |centroid| From 0d3995cc473de939310522fda3b21ef3604f3a29 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:39:05 +0300 Subject: [PATCH 04/13] fix NaN check --- lib/k_means/centroid.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 06f2fdd..6945afa 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -22,9 +22,9 @@ def create_ranges(nodes, dimensions) nodes.each do |node| node.position.each_with_index do |position, index| # Bottom range - ranges[index][0] = position if ranges[index][0].nan? || position < ranges[index][0] + ranges[index][0] = position if ranges[index][0].to_f.nan? || position < ranges[index][0] # Top range - ranges[index][1] = position if ranges[index][1].nan? || position > ranges[index][1] + ranges[index][1] = position if ranges[index][1].to_f.nan? || position > ranges[index][1] end end ranges From 7299ca0e1bb19e8bd97afb376b4152074c40da17 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:40:09 +0300 Subject: [PATCH 05/13] Fix errors --- lib/k_means/k_means.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index fd91edf..201c544 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -27,9 +27,9 @@ def view def davies_bouldin_index(centroids = @centroids) c_sz = centroids.size db_index = 0 - 0..csz.each do |i| + 0..c_sz.each do |i| max_db_index = -1.0/0 - 0..csz.each do |j| + 0..c_sz.each do |j| if i != j centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance From 35ef0a831fb22a36bd7a38125977cf0f73c78ce3 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:41:51 +0300 Subject: [PATCH 06/13] Fix errors --- lib/k_means/k_means.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 201c544..a4931e3 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -27,9 +27,9 @@ def view def davies_bouldin_index(centroids = @centroids) c_sz = centroids.size db_index = 0 - 0..c_sz.each do |i| + (0..c_sz).each do |i| max_db_index = -1.0/0 - 0..c_sz.each do |j| + (0..c_sz).each do |j| if i != j centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance From c06f63ae0482736e788e862a5dde1296a0af51bd Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:57:19 +0300 Subject: [PATCH 07/13] Fix errors --- lib/k_means/centroid.rb | 5 ++++- lib/k_means/k_means.rb | 4 +++- 2 files changed, 7 insertions(+), 2 deletions(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 6945afa..fdf3b4a 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -31,11 +31,12 @@ def create_ranges(nodes, dimensions) end end - attr_accessor :position + attr_accessor :position, :nodes def initialize(position) @position = position @mean_distance = nil + @nodes = [] end def mean_node_distance @@ -65,6 +66,8 @@ def reposition(nodes, centroids) nodes.each do |node| node.position.each_with_index do |position, index| averages[index] += position + #Store closest nodes in the centroid object + centroid.nodes << node end end @position = averages.map {|x| x / nodes.size} diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index a4931e3..306159c 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -41,7 +41,9 @@ def davies_bouldin_index(centroids = @centroids) return db_index/c_sz end - + def reset_nodes! + @nodes.each{|n| n.reset!} + end private From d7ba88bea906c744d79d72107c7d4c918bd412ae Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 11:59:43 +0300 Subject: [PATCH 08/13] Fix errors --- lib/k_means/centroid.rb | 2 -- lib/k_means/k_means.rb | 2 ++ 2 files changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index fdf3b4a..dc17703 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -66,8 +66,6 @@ def reposition(nodes, centroids) nodes.each do |node| node.position.each_with_index do |position, index| averages[index] += position - #Store closest nodes in the centroid object - centroid.nodes << node end end @position = averages.map {|x| x / nodes.size} diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 306159c..76c10c0 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -68,6 +68,8 @@ def place_nodes_into_pockets @nodes.each_with_index do |node, node_index| if node.closest_centroid == centroid centroid_pockets[centroid_index] << node_index + #Store closest nodes in the centroid object + centroid.nodes << node end end end From 8415bd0d30ebe528330f1d083940fc54758e8c43 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 12:05:44 +0300 Subject: [PATCH 09/13] Fix errors --- lib/k_means/centroid.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index dc17703..5b912aa 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -46,7 +46,7 @@ def mean_node_distance total_dist = 0.0 total_nodes = @nodes.size - total_dist.reduce(0){|sum, node| sum + node.best_distance} + @nodes.reduce(0){|sum, node| sum + node.best_distance} if total_nodes > 0 @mean_distance = total_dist/total_nodes From e7e70046be1bbc0687528bcf501707d4e8b052f1 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 12:07:13 +0300 Subject: [PATCH 10/13] Fix errors --- lib/k_means/k_means.rb | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 76c10c0..6eb8f28 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -27,9 +27,9 @@ def view def davies_bouldin_index(centroids = @centroids) c_sz = centroids.size db_index = 0 - (0..c_sz).each do |i| + (0..(c_sz - 1)).each do |i| max_db_index = -1.0/0 - (0..c_sz).each do |j| + (0..(c_sz - 1)).each do |j| if i != j centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance From 793ff9351032f81fbb520c1c1535193dc44deec6 Mon Sep 17 00:00:00 2001 From: generall Date: Tue, 10 Nov 2015 12:13:15 +0300 Subject: [PATCH 11/13] Fix errors --- lib/k_means/centroid.rb | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 5b912aa..57ce7a1 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -46,7 +46,7 @@ def mean_node_distance total_dist = 0.0 total_nodes = @nodes.size - @nodes.reduce(0){|sum, node| sum + node.best_distance} + total_dist = @nodes.reduce(0){|sum, node| sum + node.best_distance} if total_nodes > 0 @mean_distance = total_dist/total_nodes From bfc53d838ec33947df24a980d16b45bf5750a1aa Mon Sep 17 00:00:00 2001 From: generall Date: Wed, 11 Nov 2015 12:33:52 +0300 Subject: [PATCH 12/13] Add iterative clustering --- Rakefile | 2 +- lib/k_means/k_means.rb | 34 ++++++++++++++------ test/k_means/test_k_means.rb | 62 +++++++++++++++++++++++++++++------- 3 files changed, 77 insertions(+), 21 deletions(-) diff --git a/Rakefile b/Rakefile index 1e817d3..502a68b 100644 --- a/Rakefile +++ b/Rakefile @@ -43,7 +43,7 @@ end task :default => :test -require 'rake/rdoctask' +require 'rdoc/task' Rake::RDocTask.new do |rdoc| if File.exist?('VERSION') version = File.read('VERSION') diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 6eb8f28..1ecfd6b 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -2,17 +2,33 @@ class KMeans - attr_reader :centroids, :nodes, :max_iterations + attr_reader :centroids, :nodes, :max_iterations, :max_tries def initialize(data, options={}) - @distance_measure = options[:distance_measure] || :euclidean_distance - @nodes = Node.create_nodes(data, @distance_measure) - @centroids = options[:custom_centroids] || - Centroid.create_centroids(options[:centroids] || 4, @nodes) - @verbose = options[:verbose] - @max_iterations = options[:max_iterations] || 100 - - perform_cluster_process + + @max_tries = options[:max_tries] || 10 + + @best_DBI = Float::INFINITY + @best_centriods = nil + + @max_tries.times do || + @distance_measure = options[:distance_measure] || :euclidean_distance + @nodes = Node.create_nodes(data, @distance_measure) + @centroids = options[:custom_centroids] || + Centroid.create_centroids(options[:centroids] || 4, @nodes) + @verbose = options[:verbose] + @max_iterations = options[:max_iterations] || 100 + perform_cluster_process + dbi = davies_bouldin_index + if dbi < @best_DBI + @best_DBI = dbi + @best_centriods = @centroids + end + end + + @centroid = @best_centriods + place_nodes_into_pockets + end def inspect diff --git a/test/k_means/test_k_means.rb b/test/k_means/test_k_means.rb index 82bed8b..6f9a5d3 100644 --- a/test/k_means/test_k_means.rb +++ b/test/k_means/test_k_means.rb @@ -1,11 +1,15 @@ require 'helper' + class TestKMeans < Test::Unit::TestCase + context "A KMeans Instance" do setup do @data = Array.new(3) {Array.new(2) {rand}} @kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity) + @kmeans2 = KMeans.new(data, :centroids => 2, :distance_measure => :euclidean_distance) + @kmeans3 = KMeans.new(data, :centroids => 3, :distance_measure => :euclidean_distance) end should "return an array" do @@ -20,31 +24,67 @@ class TestKMeans < Test::Unit::TestCase assert_equal 3, @kmeans.nodes.size end + should "DBI of 3 clusters less than 2 clusters" do + assert @kmeans2.davies_bouldin_index > @kmeans3.davies_bouldin_index + end + end + context "A KMeans Instance with specified initial centroids" do setup do @data = Array.new(3) {Array.new(2) {rand}} - class CustomCentroid - attr_accessor :position - def initialize(position); @position = position; end - def reposition(nodes, centroid_positions); end - end - @specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) } - @kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity) + #@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) } + @kmeans = KMeans.new(@data, :centroids => 3, :distance_measure => :cosine_similarity) end should "return an inspected array" do - assert_kind_of String, @kmeans.inspect + assert_kind_of(String, @kmeans.inspect) end should "have 3 centroids" do - assert_equal 3, @kmeans.centroids.size - end + assert_equal(3, @kmeans.centroids.size) + end should "have 3 nodes" do - assert_equal 3, @kmeans.nodes.size + assert_equal(3, @kmeans.nodes.size) end end + + def data + [ + [ 1.0, 1.0 ], + [ 2.0, 1.0 ], + [ 2.0, 1.0 ], + [ 1.5, 1.0 ], + [ 1.8, 1.5 ], + [ 1.2, 1.4 ], + [ 1.9, 2.0 ], + [ 1.9, 1.0 ], + [ 1.0, 1.9 ], + [ 2.0, 2.0 ], + [ 4.3, 1.1 ], + [ 4.8, 1.2 ], + [ 4.3, 1.5 ], + [ 4.4, 1.2 ], + [ 4.1, 1.9 ], + [ 4.8, 1.7 ], + [ 4.2, 1.1 ], + [ 4.4, 1.7 ], + [ 5.8, 5.2 ], + [ 5.2, 6.2 ], + [ 5.5, 6.2 ], + [ 5.1, 6.2 ], + [ 5.2, 6.3 ], + [ 5.8, 6.1 ], + [ 5.6, 6.3 ], + [ 5.3, 6.4 ], + [ 5.5, 6.1 ], + [ 5.1, 6.2 ], + [ 5.9, 6.2 ], + [ 6.2, 5.1 ] + ] + end + end From f45a46efa095eadcabaa534ed6b325277d8bff28 Mon Sep 17 00:00:00 2001 From: generall Date: Thu, 12 Nov 2015 09:48:53 +0300 Subject: [PATCH 13/13] Add iterative centroid initialization --- lib/k_means/k_means.rb | 56 ++++++++++++++++++++++++++---------- lib/k_means/node.rb | 6 ++-- test/k_means/test_k_means.rb | 11 +++++-- 3 files changed, 53 insertions(+), 20 deletions(-) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index 1ecfd6b..77a819b 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -1,4 +1,5 @@ require 'ext/object' +require 'pry' class KMeans @@ -9,26 +10,44 @@ def initialize(data, options={}) @max_tries = options[:max_tries] || 10 @best_DBI = Float::INFINITY - @best_centriods = nil + @best_centroids = nil + @distance_measure = options[:distance_measure] || :euclidean_distance + @nodes = Node.create_nodes(data, @distance_measure) + @max_iterations = options[:max_iterations] || 100 + @verbose = options[:verbose] - @max_tries.times do || - @distance_measure = options[:distance_measure] || :euclidean_distance - @nodes = Node.create_nodes(data, @distance_measure) + @centroid_count = options[:centroids] || 4 + + raise "Too many centroids(#{@centroid_count}) for #{@nodes.size} nodes" if @centroid_count > @nodes.size + + @max_tries.times do |n| + reset_nodes! @centroids = options[:custom_centroids] || - Centroid.create_centroids(options[:centroids] || 4, @nodes) - @verbose = options[:verbose] - @max_iterations = options[:max_iterations] || 100 + Centroid.create_centroids(@centroid_count, @nodes) + perform_cluster_process dbi = davies_bouldin_index if dbi < @best_DBI + p "#{n} #{dbi.round(2)} vs #{@best_DBI.round(2)}" if options[:debug] @best_DBI = dbi - @best_centriods = @centroids + @best_centroids = @centroids end end - @centroid = @best_centriods + + if @best_centroids != nil + # can not fit nodes to all clusters + @centroids = @best_centroids + end + + + + update_nodes(true) # force update + assign_nodes_to_centriods place_nodes_into_pockets + #binding.pry if options[:debug] + end def inspect @@ -58,7 +77,9 @@ def davies_bouldin_index(centroids = @centroids) end def reset_nodes! - @nodes.each{|n| n.reset!} + if @nodes + @nodes.each{|n| n.reset!} + end end private @@ -72,7 +93,7 @@ def perform_cluster_process updates += update_nodes reposition_centroids end - place_nodes_into_pockets + assign_nodes_to_centriods end # This creates an array of arrays @@ -84,18 +105,23 @@ def place_nodes_into_pockets @nodes.each_with_index do |node, node_index| if node.closest_centroid == centroid centroid_pockets[centroid_index] << node_index - #Store closest nodes in the centroid object - centroid.nodes << node end end end @centroid_pockets = centroid_pockets end - def update_nodes + def assign_nodes_to_centriods + @nodes.each_with_index do |node, node_index| + #Store closest nodes in the centroid object + node.closest_centroid.nodes << node + end + end + + def update_nodes(force = false) sum = 0 @nodes.each do |node| - sum += node.update_closest_centroid(@centroids) + sum += node.update_closest_centroid(@centroids, force) end sum end diff --git a/lib/k_means/node.rb b/lib/k_means/node.rb index 64f2ed9..cc09fc2 100644 --- a/lib/k_means/node.rb +++ b/lib/k_means/node.rb @@ -17,10 +17,10 @@ def initialize(position, similarity_measure) @similarity_measure = similarity_measure end - def update_closest_centroid(centroids) + def update_closest_centroid(centroids, force = false) # If we haven't processed this node we need to give it an initial centroid # so that we have something to compare distances against - calculate_initial_centroid(centroids.first) unless @closest_centroid + calculate_initial_centroid(centroids.first) if (!@closest_centroid || force) updated = false centroids.each do |centroid| @@ -60,7 +60,7 @@ def calculate_distance(centroid) begin @position.send(@similarity_measure, centroid.position) rescue NoMethodError - raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements" + raise "Hey, '#{@similarity_measure}' is not a measurement. Read the README for available measurements" end end diff --git a/test/k_means/test_k_means.rb b/test/k_means/test_k_means.rb index 6f9a5d3..469a839 100644 --- a/test/k_means/test_k_means.rb +++ b/test/k_means/test_k_means.rb @@ -8,8 +8,6 @@ class TestKMeans < Test::Unit::TestCase setup do @data = Array.new(3) {Array.new(2) {rand}} @kmeans = KMeans.new(@data, :centroids => 2, :distance_measure => :cosine_similarity) - @kmeans2 = KMeans.new(data, :centroids => 2, :distance_measure => :euclidean_distance) - @kmeans3 = KMeans.new(data, :centroids => 3, :distance_measure => :euclidean_distance) end should "return an array" do @@ -24,6 +22,15 @@ class TestKMeans < Test::Unit::TestCase assert_equal 3, @kmeans.nodes.size end + + end + + context "A Many tries" do + setup do + @kmeans2 = KMeans.new(data, :centroids => 2, :distance_measure => :euclidean_distance, :max_tries => 50) + @kmeans3 = KMeans.new(data, :centroids => 3, :distance_measure => :euclidean_distance, :max_tries => 50) + end + should "DBI of 3 clusters less than 2 clusters" do assert @kmeans2.davies_bouldin_index > @kmeans3.davies_bouldin_index end