カタベログ

IT技術に関するブログを書きたい.食べ物関連はInstagramをご参照の事.

kmeansとk-means++のヘッダー

研究に必要だったんで作ってみました.
k-meansは教師なしクラスタリング手法の中で最も有名で最も使われている(と思われる)手法です.
k-meansの問題点としては,k個の代表点の選出がランダムなので,結果にばらつきが生じる事です.
それを改善したのがk-means++です.

#pragma once

#include <cstdlib>
#include <cfloat>
#include <cmath>
#include <iostream>
#include <list>
#include <vector>
#include <map>
#include <boost/numeric/ublas/io.hpp>
#include <boost/numeric/ublas/matrix.hpp>
#include <boost/numeric/ublas/matrix_proxy.hpp>
#include <boost/numeric/ublas/vector.hpp>
#include "rnd_shuffle.h"

using namespace std;
using namespace boost;
using namespace boost::numeric::ublas;

typedef matrix<double> dmat;
typedef matrix_row<dmat> dmat_r;
typedef boost::numeric::ublas::vector<double> boost_dvect;
typedef list<double> dlist;
typedef std::vector<boost_dvect> stl_dvvect;

stl_dvvect kmeans(int , dmat*);
void choose_random_center(int, stl_dvvect*, dmat*);
void choose_smart_center(int, stl_dvvect*, dmat*);
void classification(int, stl_dvvect*, dmat*);

//k-means is prototype-based unsupervised classification algorithm.
//"k" is the variable that contains the number of representation point.
//"dm" is a prototype dataset.
stl_dvvect kmeans(int const k, dmat *dm){
	stl_dvvect center_of_cluster;
	
	//choose_random_center(k, &center_of_cluster, dm);
	choose_smart_center(k, &center_of_cluster, dm);
	
	classification(k, &center_of_cluster, dm);
	
	return center_of_cluster;
}
void choose_random_center(int const k, stl_dvvect* center_of_cluster, dmat* const dm){
	rnd_shuffle rs(dm->size1());
	std::vector<int> rs_vector = rs.getshuffled();
	
	for(int i = 0; i < k; ++i){
		center_of_cluster->push_back(row(*dm, rs_vector.at(i)));
	}
}
void choose_smart_center(int const k, stl_dvvect* center_of_cluster, dmat* const dm){
	rnd_shuffle rs(dm->size1());
	std::vector<int> rs_vector = rs.getshuffled();
	
	center_of_cluster->push_back(row(*dm, rs_vector.at(0)));
	
	for(int i = 1; i < k; ++i){
		int target_index = 0;
		double probability = DBL_MIN;
		double sumD = 0.0, D[dm->size1()];
		
		for(int j = 0; j < (int)dm->size1();++j){
			double	min_dist = DBL_MAX;
			
			for(int num = 0; num < (int)center_of_cluster->size(); ++num){
				min_dist = fmin(min_dist, norm_2(center_of_cluster->at(num) - row(*dm, j)));
			}
			D[j] = min_dist;
			sumD += min_dist * min_dist;
		}
		for(int j = 0; j < (int)dm->size1(); ++j){
			if(probability < fmax(probability, (D[j] * D[j]) / sumD)){
				probability = (D[j] * D[j]) / sumD ;
				target_index = j;
			}
		}
		if(target_index >= 0 && target_index < (int)dm->size1()){
			center_of_cluster->push_back(row(*dm, target_index));
		}else{
			cout << "error occured !\n" << endl;
		}
	}
}
void classification(int const k, stl_dvvect* center_of_cluster, dmat* const dm){
	bool change_class = false;
	map<int, int> index_class_map;
	
	//This part assign all datas to a nearest cluster.
	while(true){
		change_class = false;
		
		for(int i = 0; i < (int)dm->size1(); ++i){
			int class_num = -1;
			double min_dist = DBL_MAX;
			
			for(int j = 0; j <  k; ++j){
				if(fmin(min_dist, norm_2(center_of_cluster->at(j) - row(*dm, i))) < min_dist){
					min_dist = norm_2(center_of_cluster->at(j) - row(*dm, i));
					class_num = j;
				}
			}
			
			if(index_class_map.empty() || (unsigned int) index_class_map.size() < (unsigned int)dm->size1()){
				index_class_map.insert(map<int, int>::value_type(i, class_num));
				change_class = true;
			}else{
				if(index_class_map[i] != class_num){
					index_class_map[i] = class_num;
					change_class = true;
				}
			}
		}
		
		if(!change_class){
			break;
		}
		
		//Compute each center of clusters. 
		center_of_cluster->clear();

		for(int i = 0; i < k; ++i){
			int sumNum = 0;
			boost_dvect sumC;
			sumC.resize(dm->size2());
			
			for(int j = 0; j < (int)dm->size1(); ++j){
				if(index_class_map[j] == i){
					++sumNum;
					sumC += row(*dm, j);
				}
			}
			if(sumNum > 0){
				center_of_cluster->push_back(sumC / (double)sumNum);
			}else{
				center_of_cluster->push_back(center_of_cluster->at(0));
			}
		}	
	}
}

C++/boostでプログラミングしています.
自分が使いやすいように作ったので,分かりづらいのはご容赦願います.

[追記]
オリジナルのヘッダーrnd_shuffle.hを使用している事忘れてました.
これは,メルセンヌ・ツイスターを利用して,0から指定した正の数までの数列をシャッフルするために必要です.

#pragma once

#include <iostream>
#include <ctime>
#include <boost/random.hpp>
template < typename T >

class Increment{
    T val_;
public:
    Increment( const T& init ) : val_( init ){}
    T operator()(){ return val_++; }
};

class Random{
public:
	boost::mt19937 gen;
	boost::uniform_int<int> dst;
	boost::variate_generator< boost::mt19937, boost::uniform_int<int> > rand;
	
	Random( int num ):// call instance:
	gen( static_cast<unsigned long>(std::time(0)) ), dst( 0, num ), rand( gen, dst ) {}
	
	std::ptrdiff_t operator()( std::ptrdiff_t arg ) {
		return static_cast< std::ptrdiff_t >( rand() );
	}
};

class rnd_shuffle{
private:
	int num;
	std::vector<int> vctr;
public:
	rnd_shuffle(int);
	std::vector<int> getshuffled();
};

rnd_shuffle::rnd_shuffle(int num){
	this->num = num;
	Random rnd(num - 1);

	std::generate_n(std::back_inserter(vctr), num, Increment<int>(0));
	std::random_shuffle(vctr.begin(), vctr.end(), rnd);
}

std::vector<int> rnd_shuffle::getshuffled(){
	return vctr;
}