diff --git a/Rakefile b/Rakefile index 1e817d3..502a68b 100644 --- a/Rakefile +++ b/Rakefile @@ -43,7 +43,7 @@ end task :default => :test -require 'rake/rdoctask' +require 'rdoc/task' Rake::RDocTask.new do |rdoc| if File.exist?('VERSION') version = File.read('VERSION') diff --git a/lib/k_means/centroid.rb b/lib/k_means/centroid.rb index 71bd421..de5a175 100644 --- a/lib/k_means/centroid.rb +++ b/lib/k_means/centroid.rb @@ -30,13 +30,34 @@ def create_ranges(nodes, dimensions) ranges end end - - attr_accessor :position - + + attr_accessor :position, :nodes + def initialize(position) @position = position + @mean_distance = nil + @nodes = [] end - + + def mean_node_distance + + return @mean_distance if @mean_distance + + total_dist = 0.0 + total_nodes = @nodes.size + + total_dist = @nodes.reduce(0){|sum, node| sum + node.best_distance} + + if total_nodes > 0 + @mean_distance = total_dist/total_nodes + else + # if there no nodes in cluster, so the centroid is bad + @mean_distance = 1.0/0.0 + end + + @mean_distance + end + # Finds the average distance of all the nodes assigned to # the centroid and then moves the centroid to that position def reposition(nodes, centroids) diff --git a/lib/k_means/k_means.rb b/lib/k_means/k_means.rb index f2138f7..77a819b 100644 --- a/lib/k_means/k_means.rb +++ b/lib/k_means/k_means.rb @@ -1,17 +1,53 @@ require 'ext/object' +require 'pry' class KMeans - attr_reader :centroids, :nodes + attr_reader :centroids, :nodes, :max_iterations, :max_tries def initialize(data, options={}) - distance_measure = options[:distance_measure] || :euclidean_distance - @nodes = Node.create_nodes(data, distance_measure) - @centroids = options[:custom_centroids] || - Centroid.create_centroids(options[:centroids] || 4, @nodes) + + @max_tries = options[:max_tries] || 10 + + @best_DBI = Float::INFINITY + @best_centroids = nil + @distance_measure = options[:distance_measure] || :euclidean_distance + @nodes = Node.create_nodes(data, @distance_measure) + @max_iterations = options[:max_iterations] || 100 @verbose = options[:verbose] - perform_cluster_process + @centroid_count = options[:centroids] || 4 + + raise "Too many centroids(#{@centroid_count}) for #{@nodes.size} nodes" if @centroid_count > @nodes.size + + @max_tries.times do |n| + reset_nodes! + @centroids = options[:custom_centroids] || + Centroid.create_centroids(@centroid_count, @nodes) + + perform_cluster_process + dbi = davies_bouldin_index + if dbi < @best_DBI + p "#{n} #{dbi.round(2)} vs #{@best_DBI.round(2)}" if options[:debug] + @best_DBI = dbi + @best_centroids = @centroids + end + end + + + if @best_centroids != nil + # can not fit nodes to all clusters + @centroids = @best_centroids + end + + + + update_nodes(true) # force update + assign_nodes_to_centriods + place_nodes_into_pockets + + #binding.pry if options[:debug] + end def inspect @@ -22,18 +58,42 @@ def view @centroid_pockets end + # Davies–Bouldin index: http://en.wikipedia.org/wiki/Cluster_analysis#Evaluation_of_clustering + def davies_bouldin_index(centroids = @centroids) + c_sz = centroids.size + db_index = 0 + (0..(c_sz - 1)).each do |i| + max_db_index = -1.0/0 + (0..(c_sz - 1)).each do |j| + if i != j + centroid_dist = centroids[i].position.send(@distance_measure, centroids[j].position) + sum_mean_nodes = centroids[i].mean_node_distance + centroids[j].mean_node_distance + max_db_index = [max_db_index, sum_mean_nodes / centroid_dist].max + end + end + db_index += max_db_index + end + return db_index/c_sz + end + + def reset_nodes! + if @nodes + @nodes.each{|n| n.reset!} + end + end + private def perform_cluster_process iterations, updates = 0, 1 - while updates > 0 && iterations < 100 + while updates > 0 && iterations < max_iterations iterations += 1 verbose_message("Iteration #{iterations}") updates = 0 updates += update_nodes reposition_centroids end - place_nodes_into_pockets + assign_nodes_to_centriods end # This creates an array of arrays @@ -51,10 +111,17 @@ def place_nodes_into_pockets @centroid_pockets = centroid_pockets end - def update_nodes + def assign_nodes_to_centriods + @nodes.each_with_index do |node, node_index| + #Store closest nodes in the centroid object + node.closest_centroid.nodes << node + end + end + + def update_nodes(force = false) sum = 0 @nodes.each do |node| - sum += node.update_closest_centroid(@centroids) + sum += node.update_closest_centroid(@centroids, force) end sum end diff --git a/lib/k_means/node.rb b/lib/k_means/node.rb index 64f2ed9..cc09fc2 100644 --- a/lib/k_means/node.rb +++ b/lib/k_means/node.rb @@ -17,10 +17,10 @@ def initialize(position, similarity_measure) @similarity_measure = similarity_measure end - def update_closest_centroid(centroids) + def update_closest_centroid(centroids, force = false) # If we haven't processed this node we need to give it an initial centroid # so that we have something to compare distances against - calculate_initial_centroid(centroids.first) unless @closest_centroid + calculate_initial_centroid(centroids.first) if (!@closest_centroid || force) updated = false centroids.each do |centroid| @@ -60,7 +60,7 @@ def calculate_distance(centroid) begin @position.send(@similarity_measure, centroid.position) rescue NoMethodError - raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements" + raise "Hey, '#{@similarity_measure}' is not a measurement. Read the README for available measurements" end end diff --git a/test/k_means/test_k_means.rb b/test/k_means/test_k_means.rb index 82bed8b..469a839 100644 --- a/test/k_means/test_k_means.rb +++ b/test/k_means/test_k_means.rb @@ -1,6 +1,8 @@ require 'helper' + class TestKMeans < Test::Unit::TestCase + context "A KMeans Instance" do setup do @@ -20,31 +22,76 @@ class TestKMeans < Test::Unit::TestCase assert_equal 3, @kmeans.nodes.size end + end + context "A Many tries" do + setup do + @kmeans2 = KMeans.new(data, :centroids => 2, :distance_measure => :euclidean_distance, :max_tries => 50) + @kmeans3 = KMeans.new(data, :centroids => 3, :distance_measure => :euclidean_distance, :max_tries => 50) + end + + should "DBI of 3 clusters less than 2 clusters" do + assert @kmeans2.davies_bouldin_index > @kmeans3.davies_bouldin_index + end + + end + + context "A KMeans Instance with specified initial centroids" do setup do @data = Array.new(3) {Array.new(2) {rand}} - class CustomCentroid - attr_accessor :position - def initialize(position); @position = position; end - def reposition(nodes, centroid_positions); end - end - @specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) } - @kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity) + #@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) } + @kmeans = KMeans.new(@data, :centroids => 3, :distance_measure => :cosine_similarity) end should "return an inspected array" do - assert_kind_of String, @kmeans.inspect + assert_kind_of(String, @kmeans.inspect) end should "have 3 centroids" do - assert_equal 3, @kmeans.centroids.size - end + assert_equal(3, @kmeans.centroids.size) + end should "have 3 nodes" do - assert_equal 3, @kmeans.nodes.size + assert_equal(3, @kmeans.nodes.size) end end + + def data + [ + [ 1.0, 1.0 ], + [ 2.0, 1.0 ], + [ 2.0, 1.0 ], + [ 1.5, 1.0 ], + [ 1.8, 1.5 ], + [ 1.2, 1.4 ], + [ 1.9, 2.0 ], + [ 1.9, 1.0 ], + [ 1.0, 1.9 ], + [ 2.0, 2.0 ], + [ 4.3, 1.1 ], + [ 4.8, 1.2 ], + [ 4.3, 1.5 ], + [ 4.4, 1.2 ], + [ 4.1, 1.9 ], + [ 4.8, 1.7 ], + [ 4.2, 1.1 ], + [ 4.4, 1.7 ], + [ 5.8, 5.2 ], + [ 5.2, 6.2 ], + [ 5.5, 6.2 ], + [ 5.1, 6.2 ], + [ 5.2, 6.3 ], + [ 5.8, 6.1 ], + [ 5.6, 6.3 ], + [ 5.3, 6.4 ], + [ 5.5, 6.1 ], + [ 5.1, 6.2 ], + [ 5.9, 6.2 ], + [ 6.2, 5.1 ] + ] + end + end