Kmeans base on Cython

背景

Kmeans 是机器学习比较基础的算法,利用包调用比较容易,未来的算法可以很复杂,但基础都是一样简单的。算法层面尽量写得简单,将优化过程尽量写复杂。由于想使用Cython,先写C++部分,这里需要定义命名空间。头文件代码如下:

代码

KMeans.h

#ifndef KMEANS
#define KMEANS

#include <iostream>
#include <Eigen/Dense>

using namespace std;
using namespace Eigen;

namespace cluster {
    class Kmeans{
    public:
        size_t n_cluster;
        int max_iter;
        double tol;
        int n_jobs;
        MatrixXd centers;
        vector<int> label;
        Kmeans();
        Kmeans(int n,int mi,double to,int jobs);
        ~Kmeans();
        void fit(const Eigen::Map<Eigen::MatrixXd> &mat);
        MatrixXd & center();
        vector<int> predict(const Eigen::Map<Eigen::MatrixXd> & data);
    };
}
#endif

这里使用Eigen库,也是为了将整个项目做的尽量复杂。其次,实现过程中使用一些c++11的特性,简化代码,实现KMeans的代码如下:

KMeans.cpp

#include "KMeans.h"
#include <iostream>
#include <Eigen/Dense>
#include "rand.h"
#include <functional>

using namespace std;
using namespace Eigen;

namespace cluster {
    Kmeans::Kmeans(){//参数借鉴sklearn
        n_cluster=2;
        max_iter=300;
        tol=0.0001;
        n_jobs=1;
    }
    Kmeans::Kmeans(int n,int mi,double to,int jobs){
        n_cluster=n;
        max_iter=mi;
        tol=to;
        n_jobs=jobs;
    }
    Kmeans::~Kmeans(){}
    void Kmeans::fit(const Eigen::Map<Eigen::MatrixXd> &input){
        centers.resize(n_cluster,input.rows());
        MatrixXd temp_centers(n_cluster,input.rows());
        MatrixXd data(input);//according to col
        vector<int>index=CreateRandomNums(data.rows(),n_cluster);
        //自己定义的随机数生成,随机初始化聚类中心
        for(size_t i=0;i<n_cluster;i++){
            temp_centers.col(i)=data.col(index[i]);
        }
        label=vector<int>(input.cols());
    
        int iter=0;
    
        while(//使用lambda函数,简化代码
            [=](){
                size_t tmp_loop=centers.cols();
                for(size_t i=0;i<tmp_loop;i++){
                    double diff=(centers.col(i)-temp_centers.col(i)).norm();
                    if(diff>tol)
                        return 1;
                }
                return 0;
            }() && max_iter>iter){
            centers=temp_centers;
            size_t tmp_loop2=data.cols();
            for(size_t i=0;i<tmp_loop2;i++){
                int minclass=0;
                double dis=9999999999999;
                double temp_dis;
                for(size_t j=0;j<n_cluster;j++){
                    temp_dis=[=](){ return (data.col(i)-centers.col(j)).norm();}();
                    //cout<<temp_dis<<endl;
                    if(temp_dis<dis){
                        dis=temp_dis;
                        minclass=j;
                    }
                }
                label[i]=minclass;
            }
            temp_centers=MatrixXd::Zero(n_cluster,input.rows());
            vector<int> count=vector<int>(n_cluster);

            tmp_loop2=input.cols();
            for(size_t i=0;i<tmp_loop2;i++){
                temp_centers.col(label[i])=(count[label[i]]*temp_centers.col(label[i])+data.col(i))/(count[label[i]]+1);
                count[label[i]]+=1;
            }
            iter+=1;
        }
        cout<<"final iterator times:"<<iter<<endl;
    }
    MatrixXd & Kmeans::center(){//返回聚类中心
        return centers;
    }
    vector<int> Kmeans::predict(const Eigen::Map<Eigen::MatrixXd> & data){
    	//根据聚类中心返回聚类结果
        size_t num=data.cols();
        cout<<num<<endl;
        vector<int> result=vector<int>(num);
        for(size_t i=0;i<num;i++){
            int minclass=0;
                double dis=9999999999999;
                double temp_dis;
                for(size_t j=0;j<n_cluster;j++){
                    temp_dis=[=](){ return (data.col(i)-centers.col(j)).norm();}();
                    //cout<<temp_dis<<endl;
                    if(temp_dis<dis){
                        dis=temp_dis;
                        minclass=j;
                    }
                }
                result[i]=minclass;
        }
        return result;
    }
}

下面是引用c++部分的关键,需要先写.pyx、.pxd文件,这是链接C代码的关键。

cluster.pxd

from libcpp.vector cimport vector
from eigency.core cimport *

cdef extern from "KMeans.cpp":
    pass

# Decalre the class with cdef
cdef extern from "KMeans.h" namespace "cluster":
    cdef cppclass Kmeans:
        Kmeans() except +
        Kmeans(int ,int ,double ,int) except +
        void fit(Map[MatrixXd] &)
        int n_cluster
        int max_iter
        double tol
        int n_jobs
        vector[int] label
        MatrixXd centers
        MatrixXd & center()
        vector[int] predict(Map[MatrixXd] &)

clu.pyx

# cython: language_level=3
# distutils: language = c++

from cluster cimport Kmeans
from libcpp.vector cimport vector

# 这里定义的函数才能作为接口被引用
cdef class KMeans:
    cdef Kmeans c_kmeans  # Hold a C++ instance which we're wrapping

    def __cinit__(self,n_cluster,max_iter,tol,n_jobs):
        self.c_kmeans = Kmeans(n_cluster,max_iter,tol,n_jobs)
    def fit(self, np.ndarray array):
        return self.c_kmeans.fit(Map[MatrixXd](array))

    def center(self):
        return ndarray(self.c_kmeans.center())

    def predict(self,np.ndarray array):
        return self.c_kmeans.predict(Map[MatrixXd](array))

下面是编译的关键文件,这里并没有写完整,而应该根据不同机器调整,原因是需要引入eigen库

setup.py

from distutils.core import setup,Extension
from Cython.Build import cythonize
import eigency

setup(
	ext_modules=cythonize(Extension(
		name='cluster',
		author='jiajun',
		version='0.0.1',
		sources=['clu.pyx'],
		language='c++',
		extra_compile_args=["-std=c++11"],
		include_dirs=[".", "module-dir-name"] + eigency.get_includes(),
		#eigency的路径的配置方式
		#include_dirs=["../eigen3","/Users/XXX/anaconda3/envs/py37/lib/python3.7/site-packages/numpy/core/include"],
		#如果自己安装了eigen,则可以配置自己的路径,numpy引用出错时,需要将其头文件引入
		install_requires=['Cython>=0.2.15','eigency>=1.77'],
		packages=['little-try'],
		python_requires='>=3'
	))
)

编译

最后,手动编译,即可得到想要的库

python setup.py build_ext --inplace

这里补充一个小测试:

from cluster import KMeans

import numpy as np

data=np.random.rand(100,20)

cl=KMeans(2, 300 , 0.0001, 1 )

cl.fit(data)

cl.predict(np.random.rand(3,20))

cl.center()

打包给其他人用

创建源码分发文件

python setup.py sdist

创建二进制分发文件

python setup.py bdist