博客
关于我
强烈建议你试试无所不能的chatGPT,快点击我
KMeans与深度学习自编码AutoEncoder结合提高聚类效果
阅读量:4210 次
发布时间:2019-05-26

本文共 7641 字,大约阅读时间需要 25 分钟。

特征是用户的消费商品的消费金额,原始数据(部分)是这样的:

id,goods_name,goods_amount
1,男士手袋,1882.0
2,淑女装,2491.0
3,淑女装,2492.0
2,女士手袋,345.0
4,基础内衣,328.0
5,商务正装,4985.0
5,时尚,969.0
5,女饰品,86.0
6,专业运动,399.0
6,童装(中大童),2033.0
6,男士配件,38.0

看到同一个id下面有不同的消费记录,这个数据不能直接拿来用,写了python程序来进行处理datadeal.py:

#!/usr/bin/python
# coding:utf-8
# author:wuyy
"""
Data preprocessing (datadeal.py): pivot raw (user_id, category, amount)
records from info.txt into a user x category amount matrix, written to
data_matrix.txt.  The category -> column-index mapping is saved to class.txt.
"""
import time

import numpy as np
import pandas as pd


def build_class_dict(categories):
    """Map each distinct goods category to a 0-based column index.

    categories: iterable of category names, duplicates allowed.
    Returns {category: column_index}, in first-seen order.
    """
    mapping = {}
    for cat in categories:
        if cat not in mapping:
            mapping[cat] = len(mapping)
    return mapping


def build_user_matrix(user_ids, categories, amounts, dict_class):
    """Pivot the records into {user_id: [amount per category column]}.

    A later record for the same (user, category) pair overwrites the earlier
    one — same behaviour as the original code (amounts are NOT summed).
    """
    n_cols = len(dict_class)
    matrix = {}
    for uid, cat, amount in zip(user_ids, categories, amounts):
        # setdefault creates a fresh zero row on the first sighting of uid
        row = matrix.setdefault(uid, list(np.zeros(n_cols)))
        row[dict_class[cat]] = amount
    return matrix


def main():
    # Load the raw records and drop rows with missing fields.
    x = pd.read_table('info.txt', sep=",")
    x = x.dropna(axis=0)
    a1 = list(x.iloc[:, 0])  # user ids
    a2 = list(x.iloc[:, 1])  # goods categories
    a3 = list(x.iloc[:, 2])  # consumption amounts
    print("数据表:", x)

    dict_class = build_class_dict(a2)
    # Persist the category -> column-index mapping for later lookup.
    with open('class.txt', 'w') as f:
        for k, v in dict_class.items():
            f.write(str(k) + '\t' + str(v) + '\n')

    # time.clock() was removed in Python 3.8; perf_counter() is the
    # recommended replacement for wall-clock interval timing.
    start = time.perf_counter()
    dictall = build_user_matrix(a1, a2, a3, dict_class)
    print('dictall:', dictall)

    # Transpose so each row is a user id and each column a category.
    dictall_matrix = pd.DataFrame(dictall).T
    print("dictall_matrix:", dictall_matrix)
    dictall_matrix.to_csv("data_matrix.txt", index=True, header=None)
    end = time.perf_counter()
    print("赋值过程运行时间是:%f s" % (end - start))
    # NOTE(review): the original ended with `df = pd.DataFrame(...); df[id] = 1`,
    # which indexed the DataFrame with the *builtin* `id` function — leftover
    # debug code with a bug; removed.


if __name__ == "__main__":
    main()

数据处理完成之后,进行 AutoEncoder 编码(AE.py):

#!/usr/bin/python
# coding:utf-8
# author:wuyy
"""
Auto-encoder (AE) model.  If the raw input has many dimensions, not all
features are useful; training an auto-encoder lets you compress the input
down to m dimensions (the number of hidden-layer nodes) before clustering.
"""
import numpy as np
import pandas as pd


class AutoEncoder():
    """Fully-connected auto-encoder trained with plain batch back-propagation.

    layer      1     2    ...    ...    L-1    L
      W        0     1    ...    ...    L-2
      B        0     1    ...    ...    L-2
      Z              0     1     ...    L-3    L-2
      A              0     1     ...    L-3    L-2
    """

    def __init__(self, X, Y, nNodes):
        # Training inputs (one sample per row) and reconstruction targets;
        # for an auto-encoder callers pass the same array for both.
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # total number of layers (input + hidden(s) + output)
        self.nLayers = len(nNodes)
        # nodes per layer, e.g. [input_dim, hidden_dim, input_dim]
        self.nNodes = nNodes
        # Parameters and per-layer work buffers.
        self.W = list()      # weights, W[l] has shape (nNodes[l], nNodes[l+1])
        self.B = list()      # biases, B[l] has shape (nNodes[l+1],)
        self.dW = list()     # accumulated weight gradients
        self.dB = list()     # accumulated bias gradients
        self.A = list()      # post-sigmoid activations
        self.Z = list()      # pre-activations
        self.delta = list()  # back-propagated error terms
        for iLayer in range(self.nLayers - 1):
            self.W.append(
                np.random.rand(nNodes[iLayer] * nNodes[iLayer + 1]).reshape(nNodes[iLayer], nNodes[iLayer + 1]))
            self.B.append(np.random.rand(nNodes[iLayer + 1]))
            self.dW.append(np.zeros([nNodes[iLayer], nNodes[iLayer + 1]]))
            self.dB.append(np.zeros(nNodes[iLayer + 1]))
            self.A.append(np.zeros(nNodes[iLayer + 1]))
            self.Z.append(np.zeros(nNodes[iLayer + 1]))
            self.delta.append(np.zeros(nNodes[iLayer + 1]))
        # value of the cost function, updated by each BackPropAlgorithm() call
        self.Jw = 0.0
        # activation function (logistic sigmoid); original attribute name kept
        self.sigmod = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = 2.5
        # steps of iteration
        self.steps = 10000

    def _forward(self, sample):
        """Forward-propagate one sample, filling self.Z and self.A."""
        for iLayer in range(self.nLayers - 1):
            if iLayer == 0:  # first layer reads the raw sample
                self.Z[iLayer] = np.dot(sample, self.W[iLayer])
            else:
                self.Z[iLayer] = np.dot(self.A[iLayer - 1], self.W[iLayer])
            self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])

    def BackPropAlgorithm(self):
        """Run one full-batch gradient-descent step over all M samples."""
        # clear accumulated cost and gradients
        self.Jw = 0.0
        for iLayer in range(self.nLayers - 1):
            self.dW[iLayer].fill(0.0)
            self.dB[iLayer].fill(0.0)
        # accumulate gradients sample by sample
        for i in range(self.M):
            self._forward(self.X[i])
            # back propagation, output layer first
            for iLayer in range(self.nLayers - 1)[::-1]:
                if iLayer == self.nLayers - 2:  # output layer
                    # Use Y for both the cost and the gradient (the original
                    # mixed X into the gradient and Y into the cost, which
                    # only coincided because callers pass X == Y).
                    err = self.Y[i] - self.A[iLayer]
                    self.delta[iLayer] = -err * (self.A[iLayer] * (1 - self.A[iLayer]))
                    self.Jw += np.dot(err, err) / self.M
                else:
                    # BUG FIX: the error reaching layer iLayer flows back
                    # through the weights ABOVE it, i.e. W[iLayer + 1].
                    # The original used W[iLayer].T, which only matched
                    # dimensionally for symmetric layer sizes and even then
                    # multiplied by the wrong matrix.
                    self.delta[iLayer] = np.dot(self.W[iLayer + 1], self.delta[iLayer + 1]) * (
                        self.A[iLayer] * (1 - self.A[iLayer]))
                # accumulate dW (outer product) and dB for this sample
                if iLayer == 0:
                    self.dW[iLayer] += self.X[i][:, np.newaxis] * self.delta[iLayer][:, np.newaxis].T
                else:
                    self.dW[iLayer] += self.A[iLayer - 1][:, np.newaxis] * self.delta[iLayer][:, np.newaxis].T
                self.dB[iLayer] += self.delta[iLayer]
        # gradient-descent update, averaged over the batch
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        """Train for self.steps iterations, printing the cost each step."""
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self):
        """Forward every sample and dump the hidden-layer codes to jaingwei.txt.

        Output layout: one row per hidden unit ('weidu' column holds the unit
        number), one extra column per sample holding that sample's code.
        """
        # Hidden size taken from the architecture instead of hard-coded 5.
        a = np.arange(1, self.nNodes[1] + 1)
        df = pd.DataFrame(a, columns=['weidu'])
        for i in range(self.M):
            print(self.X[i])
            self._forward(self.X[i])
            for iLayer in range(self.nLayers - 1):
                print("\t layer=%d" % iLayer, self.A[iLayer])
            # A[0] is the hidden-layer activation — the compressed code
            df[str(i + 1)] = self.A[0]
        df.to_csv("jaingwei.txt", index=False)


def _load_matrix(path):
    """Read data_matrix.txt: first CSV field is the id, the rest are floats."""
    data = []
    index = []
    with open(path, 'r') as f:
        for line in f:
            ss = line.replace('\n', '').split(',')
            index.append(ss[0])
            data.append([float(v) for v in ss[1:]])
    return np.array(data), index


def main():
    # sklearn is only needed when run as a script, so import it here —
    # the class stays importable without scikit-learn installed.
    from sklearn import preprocessing

    x, _index = _load_matrix('./data_matrix.txt')
    # standardize features before training
    xx = preprocessing.scale(x)
    # Derive the input width from the data instead of hard-coding 10;
    # 5 hidden nodes as in the original experiment.
    nNodes = np.array([xx.shape[1], 5, xx.shape[1]])
    ae3 = AutoEncoder(xx, xx, nNodes)
    ae3.PlainAutoEncoder()
    ae3.ValidateAutoEncoder()
    print("ae结果:", ae3.A[0])


if __name__ == "__main__":
    main()

使用sklearn 的Kmeans 进行聚类

#!/usr/bin/python
# coding:utf-8
# Author :wuyy
"""Cluster the auto-encoder codes (jaingwei.txt) with KMeans and report quality."""
import numpy as np
import pandas as pd
from sklearn import metrics
from sklearn import preprocessing
from sklearn.cluster import KMeans

# NOTE(review): the original also imported matplotlib, scipy, sparse,
# StandardScaler, pickle and sklearn.externals.joblib — all unused, and
# sklearn.externals.joblib was removed in scikit-learn 0.23, so the import
# itself crashed on modern installs.

# Load the hidden-layer codes; transpose so samples become rows.
data = pd.read_table('jaingwei.txt', sep=",")
data = data.T
# DataFrame.ix was removed in pandas 1.0 — use positional .iloc instead.
x = data.iloc[1:, 0:5]
print(x)
card = data.iloc[:, 0]
x1 = np.array(x)
print("x1:", x1)
# standardize before clustering so all code dimensions weigh equally
xx = preprocessing.scale(x1)
print("preprocessing.scale xx:", xx)

num_clusters = 3
# n_jobs was removed from KMeans in scikit-learn 1.0; it is dropped here.
clf = KMeans(n_clusters=num_clusters, n_init=1, verbose=1)
clf.fit(xx)
print("label:", clf.labels_)
labels = clf.labels_
# silhouette score: higher means better-separated clusters
score = metrics.silhouette_score(xx, labels)
# clf.inertia_ helps judge whether the cluster count fits; smaller is tighter
print("clf.inertia_", clf.inertia_)
print(score)

github地址:

转载自:

你可能感兴趣的文章
JQuery 简介
查看>>
Java创建对象的方法
查看>>
Extjs自定义组件
查看>>
TreeGrid 异步加载节点
查看>>
Struts2 标签库讲解
查看>>
Google Web工具包 GWT
查看>>
材料与工程学科相关软件
查看>>
windows 下AdNDP 安装使用
查看>>
Project 2013项目管理教程(1):项目管理概述及预备
查看>>
ssh客户端后台运行
查看>>
哥去求职,才说了一句话考官就让我出去
查看>>
【React Native】把现代web科技带给移动开发者(一)
查看>>
【GoLang】Web工作方式
查看>>
Launch Sublime Text 3 from the command line
查看>>
【数据库之mysql】mysql的安装(一)
查看>>
【数据库之mysql】 mysql 入门教程(二)
查看>>
【HTML5/CSS/JS】A list of Font Awesome icons and their CSS content values(一)
查看>>
【HTML5/CSS/JS】<br>与<p>标签区别(二)
查看>>
【HTML5/CSS/JS】开发跨平台应用工具的选择(三)
查看>>
【心灵鸡汤】Give it five minutes不要让一个好主意随风而去
查看>>