Skip to content
This repository was archived by the owner on Feb 26, 2019. It is now read-only.
Open
Show file tree
Hide file tree
Changes from all commits
Commits
File filter

Filter by extension

Filter by extension

Conversations
Failed to load comments.
Loading
Jump to
Jump to file
Failed to load files.
Loading
Diff view
Diff view
2 changes: 1 addition & 1 deletion Rakefile
Original file line number Diff line number Diff line change
Expand Up @@ -43,7 +43,7 @@ end

task :default => :test

require 'rake/rdoctask'
require 'rdoc/task'
Rake::RDocTask.new do |rdoc|
if File.exist?('VERSION')
version = File.read('VERSION')
Expand Down
29 changes: 25 additions & 4 deletions lib/k_means/centroid.rb
Original file line number Diff line number Diff line change
Expand Up @@ -30,13 +30,34 @@ def create_ranges(nodes, dimensions)
ranges
end
end
attr_accessor :position

attr_accessor :position, :nodes

# Creates a centroid located at +position+.
#
# The centroid starts with an empty node list and no cached mean distance;
# @mean_distance is filled in lazily by #mean_node_distance.
def initialize(position)
  @position, @mean_distance = position, nil
  @nodes = []
end


# Mean of +best_distance+ over the nodes currently stored in @nodes.
#
# The result is memoized in @mean_distance on the first call, so later
# changes to @nodes are not reflected by subsequent calls.
#
# Returns a Float. When there are no nodes in the cluster the centroid is
# considered bad and Float::INFINITY is returned.
def mean_node_distance
  return @mean_distance if @mean_distance

  @mean_distance =
    if @nodes.empty?
      Float::INFINITY
    else
      # Seed the fold with 0.0 so that integer distances are averaged with
      # float division instead of being truncated by integer division.
      @nodes.reduce(0.0) { |sum, node| sum + node.best_distance } / @nodes.size
    end
end

# Finds the average distance of all the nodes assigned to
# the centroid and then moves the centroid to that position
def reposition(nodes, centroids)
Expand Down
87 changes: 77 additions & 10 deletions lib/k_means/k_means.rb
Original file line number Diff line number Diff line change
@@ -1,17 +1,53 @@
require 'ext/object'
require 'pry'

class KMeans

attr_reader :centroids, :nodes
attr_reader :centroids, :nodes, :max_iterations, :max_tries

def initialize(data, options={})
distance_measure = options[:distance_measure] || :euclidean_distance
@nodes = Node.create_nodes(data, distance_measure)
@centroids = options[:custom_centroids] ||
Centroid.create_centroids(options[:centroids] || 4, @nodes)

@max_tries = options[:max_tries] || 10

@best_DBI = Float::INFINITY
@best_centroids = nil
@distance_measure = options[:distance_measure] || :euclidean_distance
@nodes = Node.create_nodes(data, @distance_measure)
@max_iterations = options[:max_iterations] || 100
@verbose = options[:verbose]

perform_cluster_process
@centroid_count = options[:centroids] || 4

raise "Too many centroids(#{@centroid_count}) for #{@nodes.size} nodes" if @centroid_count > @nodes.size

@max_tries.times do |n|
reset_nodes!
@centroids = options[:custom_centroids] ||
Centroid.create_centroids(@centroid_count, @nodes)

perform_cluster_process
dbi = davies_bouldin_index
if dbi < @best_DBI
p "#{n} #{dbi.round(2)} vs #{@best_DBI.round(2)}" if options[:debug]
@best_DBI = dbi
@best_centroids = @centroids
end
end


if @best_centroids != nil
# can not fit nodes to all clusters
@centroids = @best_centroids
end



update_nodes(true) # force update
assign_nodes_to_centriods
place_nodes_into_pockets

#binding.pry if options[:debug]

end

def inspect
Expand All @@ -22,18 +58,42 @@ def view
@centroid_pockets
end

# Davies–Bouldin index: http://en.wikipedia.org/wiki/Cluster_analysis#Evaluation_of_clustering
#
# For every centroid, takes the largest ratio of
#   (its mean node distance + the other centroid's mean node distance)
# to the distance between the two centroid positions (measured with
# @distance_measure), then averages those maxima over all centroids.
#
# centroids - objects responding to +position+ and +mean_node_distance+
#             (defaults to @centroids).
#
# Returns the averaged index.
def davies_bouldin_index(centroids = @centroids)
  count = centroids.size
  total = 0
  centroids.each_with_index do |current, i|
    worst_ratio = -Float::INFINITY
    centroids.each_with_index do |other, j|
      # A centroid is never compared with itself.
      next if i == j

      separation = current.position.send(@distance_measure, other.position)
      scatter = current.mean_node_distance + other.mean_node_distance
      worst_ratio = [worst_ratio, scatter / separation].max
    end
    total += worst_ratio
  end
  total / count
end

# Sends +reset!+ to every node in @nodes.
#
# Does nothing (and returns nil) when @nodes has not been set yet.
def reset_nodes!
  @nodes.each(&:reset!) if @nodes
end

private

def perform_cluster_process
iterations, updates = 0, 1
while updates > 0 && iterations < 100
while updates > 0 && iterations < max_iterations
iterations += 1
verbose_message("Iteration #{iterations}")
updates = 0
updates += update_nodes
reposition_centroids
end
place_nodes_into_pockets
assign_nodes_to_centriods
end

# This creates an array of arrays
Expand All @@ -51,10 +111,17 @@ def place_nodes_into_pockets
@centroid_pockets = centroid_pockets
end

def update_nodes
# Rebuilds each centroid's +nodes+ list from every node's current
# +closest_centroid+.
#
# The lists of all centroids in @centroids are emptied first. This method is
# invoked more than once over the lifetime of a clustering run, and without
# the reset every extra call would append the same nodes again, leaving
# duplicates in the centroid's list.
#
# Returns @nodes.
def assign_nodes_to_centriods
  @centroids.each { |centroid| centroid.nodes.clear }
  # Store closest nodes in the centroid object
  @nodes.each { |node| node.closest_centroid.nodes << node }
end

def update_nodes(force = false)
sum = 0
@nodes.each do |node|
sum += node.update_closest_centroid(@centroids)
sum += node.update_closest_centroid(@centroids, force)
end
sum
end
Expand Down
6 changes: 3 additions & 3 deletions lib/k_means/node.rb
Original file line number Diff line number Diff line change
Expand Up @@ -17,10 +17,10 @@ def initialize(position, similarity_measure)
@similarity_measure = similarity_measure
end

def update_closest_centroid(centroids)
def update_closest_centroid(centroids, force = false)
# If we haven't processed this node we need to give it an initial centroid
# so that we have something to compare distances against
calculate_initial_centroid(centroids.first) unless @closest_centroid
calculate_initial_centroid(centroids.first) if (!@closest_centroid || force)

updated = false
centroids.each do |centroid|
Expand Down Expand Up @@ -60,7 +60,7 @@ def calculate_distance(centroid)
begin
@position.send(@similarity_measure, centroid.position)
rescue NoMethodError
raise "Hey, '#{@similarity_measure}' is not a measurement. Read the REAdME for available measurements"
raise "Hey, '#{@similarity_measure}' is not a measurement. Read the README for available measurements"
end
end

Expand Down
69 changes: 58 additions & 11 deletions test/k_means/test_k_means.rb
Original file line number Diff line number Diff line change
@@ -1,6 +1,8 @@
require 'helper'


class TestKMeans < Test::Unit::TestCase

context "A KMeans Instance" do

setup do
Expand All @@ -20,31 +22,76 @@ class TestKMeans < Test::Unit::TestCase
assert_equal 3, @kmeans.nodes.size
end


end

context "A Many tries" do
setup do
@kmeans2 = KMeans.new(data, :centroids => 2, :distance_measure => :euclidean_distance, :max_tries => 50)
@kmeans3 = KMeans.new(data, :centroids => 3, :distance_measure => :euclidean_distance, :max_tries => 50)
end

should "DBI of 3 clusters less than 2 clusters" do
assert @kmeans2.davies_bouldin_index > @kmeans3.davies_bouldin_index
end

end


context "A KMeans Instance with specified initial centroids" do
setup do
@data = Array.new(3) {Array.new(2) {rand}}
class CustomCentroid
attr_accessor :position
def initialize(position); @position = position; end
def reposition(nodes, centroid_positions); end
end

@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
@kmeans = KMeans.new(@data, :custom_centroids => @specified_centroids, :distance_measure => :cosine_similarity)
#@specified_centroids = @data[0..2].map { |d| CustomCentroid.new(d) }
@kmeans = KMeans.new(@data, :centroids => 3, :distance_measure => :cosine_similarity)
end

should "return an inspected array" do
assert_kind_of String, @kmeans.inspect
assert_kind_of(String, @kmeans.inspect)
end

should "have 3 centroids" do
assert_equal 3, @kmeans.centroids.size
end
assert_equal(3, @kmeans.centroids.size)
end

should "have 3 nodes" do
assert_equal 3, @kmeans.nodes.size
assert_equal(3, @kmeans.nodes.size)
end
end

# Fixture: 30 two-dimensional points, returned as a fresh array on each call.
# The points fall into three visually separate groups, listed in order.
def data
  # x in 1.0..2.0, y in 1.0..2.0
  lower_left = [
    [1.0, 1.0], [2.0, 1.0], [2.0, 1.0], [1.5, 1.0], [1.8, 1.5],
    [1.2, 1.4], [1.9, 2.0], [1.9, 1.0], [1.0, 1.9], [2.0, 2.0]
  ]
  # x in 4.1..4.8, y in 1.1..1.9
  lower_right = [
    [4.3, 1.1], [4.8, 1.2], [4.3, 1.5], [4.4, 1.2],
    [4.1, 1.9], [4.8, 1.7], [4.2, 1.1], [4.4, 1.7]
  ]
  # x in 5.1..6.2, y in 5.1..6.4
  upper_right = [
    [5.8, 5.2], [5.2, 6.2], [5.5, 6.2], [5.1, 6.2], [5.2, 6.3], [5.8, 6.1],
    [5.6, 6.3], [5.3, 6.4], [5.5, 6.1], [5.1, 6.2], [5.9, 6.2], [6.2, 5.1]
  ]
  lower_left + lower_right + upper_right
end

end