The features are the amounts each user spent on goods categories. The raw data (a sample) looks like this:
id,goods_name,goods_amount
1,男士手袋,1882.0
2,淑女装,2491.0
3,淑女装,2492.0
2,女士手袋,345.0
4,基础内衣,328.0
5,商务正装,4985.0
5,时尚,969.0
5,女饰品,86.0
6,专业运动,399.0
6,童装(中大童),2033.0
6,男士配件,38.0
Notice that the same id can have several purchase records, so this data cannot be fed to a model directly. The following Python script, datadeal.py, pivots it into one row per user:
#!/usr/bin/python
# coding:utf-8
# author:wuyy
'''Data preprocessing: pivot the raw purchase log into a user x category amount matrix.'''
import time

import numpy as np
import pandas as pd

# Load the raw file and drop rows with missing values.
x = pd.read_table('info.txt', sep=",")
x = x.dropna(axis=0)
a1 = list(x.iloc[:, 0])  # user ids
a2 = list(x.iloc[:, 1])  # goods categories
a3 = list(x.iloc[:, 2])  # amounts
print("data table:", x)

# A holds the distinct goods categories (dict keys are unique).
dicta = dict(zip(a2, zip(a1, a3)))
print("dicta:", dicta)
A = list(dicta.keys())
# B holds the distinct user ids.
B = list(set(a1))

# Map each category name to a column index.
dict_class = dict(zip(A, list(np.arange(len(A)))))

# Persist the category-to-index mapping.
f = open('class.txt', 'w')
for k, v in dict_class.items():
    f.write(str(k) + '\t' + str(v) + '\n')
f.close()

start = time.perf_counter()  # time.clock() was removed in Python 3.8
# One dict entry per user: a vector of amounts indexed by category
# (a later record in the same category overwrites the earlier one).
dictall = {}
for i in range(len(a1)):
    if a1[i] in dictall.keys():
        value = dictall[a1[i]]
    else:
        value = list(np.zeros(len(A)))
    j = dict_class[a2[i]]
    value[j] = a3[i]
    dictall[a1[i]] = value
print('dictall:', dictall)

# Turn the dict into a DataFrame: one row per user, one column per category.
dictall1 = pd.DataFrame(dictall)
dictall_matrix = dictall1.T
print("dictall_matrix:", dictall_matrix)
dictall_matrix.to_csv("data_matrix.txt", index=True, header=None)

end = time.perf_counter()
print("assignment loop took: %f s" % (end - start))
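For reference, the same pivot can be written in a few lines of pandas alone. This is a minimal alternative sketch, not part of the original datadeal.py, and it assumes the info.txt layout shown above; note that aggfunc='sum' adds up duplicate (id, category) purchases, whereas the loop above keeps only the last one (aggfunc='last' would mirror that behaviour exactly).

import pandas as pd

# Read the raw log and pivot: one row per id, one column per goods_name,
# with missing (id, category) pairs filled with zero.
raw = pd.read_table('info.txt', sep=",").dropna(axis=0)
matrix = raw.pivot_table(index='id', columns='goods_name',
                         values='goods_amount', aggfunc='sum', fill_value=0)
matrix.to_csv("data_matrix.txt", header=None)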
Once preprocessing is done, run the AutoEncoder step, AE.py:
#!/usr/bin/python
# coding:utf-8
# author:wuyy
'''
The AE (auto-encoder) model compresses the data. If the input has many
dimensions, say thousands of features, feeding them all to the algorithm
rarely works well, because not every feature carries signal. With an AE you
can compress the input down to m dimensions, m being the number of
hidden-layer nodes.
'''
import numpy as np
import pandas as pd
from sklearn import preprocessing


class AutoEncoder():
    """ Auto Encoder
    layer  1  2 ... ... L-1  L
    W      0  1 ... ... L-2
    B      0  1 ... ... L-2
    Z         0  1 ...  L-3  L-2
    A         0  1 ...  L-3  L-2
    """

    def __init__(self, X, Y, nNodes):
        # training samples and targets (identical for an auto-encoder)
        self.X = X
        self.Y = Y
        # number of samples
        self.M = len(self.X)
        # number of layers
        self.nLayers = len(nNodes)
        # nodes per layer
        self.nNodes = nNodes
        # network parameters and per-layer buffers
        self.W = list()
        self.B = list()
        self.dW = list()
        self.dB = list()
        self.A = list()
        self.Z = list()
        self.delta = list()
        for iLayer in range(self.nLayers - 1):
            self.W.append(np.random.rand(nNodes[iLayer] * nNodes[iLayer + 1])
                          .reshape(nNodes[iLayer], nNodes[iLayer + 1]))
            self.B.append(np.random.rand(nNodes[iLayer + 1]))
            self.dW.append(np.zeros([nNodes[iLayer], nNodes[iLayer + 1]]))
            self.dB.append(np.zeros(nNodes[iLayer + 1]))
            self.A.append(np.zeros(nNodes[iLayer + 1]))
            self.Z.append(np.zeros(nNodes[iLayer + 1]))
            self.delta.append(np.zeros(nNodes[iLayer + 1]))
        # value of the cost function
        self.Jw = 0.0
        # activation function (logistic)
        self.sigmod = lambda z: 1.0 / (1.0 + np.exp(-z))
        # learning rate
        self.alpha = 2.5
        # number of iterations
        self.steps = 10000

    def BackPropAlgorithm(self):
        # reset cost and gradients
        self.Jw = 0.0
        for iLayer in range(self.nLayers - 1):
            self.dW[iLayer].fill(0.0)
            self.dB[iLayer].fill(0.0)
        # iterate over the M samples
        for i in range(self.M):
            # forward propagation
            for iLayer in range(self.nLayers - 1):
                if iLayer == 0:  # first layer reads the input
                    self.Z[iLayer] = np.dot(self.X[i], self.W[iLayer])
                else:
                    self.Z[iLayer] = np.dot(self.A[iLayer - 1], self.W[iLayer])
                self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
            # back propagation (layers in reverse)
            for iLayer in range(self.nLayers - 1)[::-1]:
                if iLayer == self.nLayers - 2:  # output layer
                    self.delta[iLayer] = -(self.Y[i] - self.A[iLayer]) * (self.A[iLayer] * (1 - self.A[iLayer]))
                    self.Jw += np.dot(self.Y[i] - self.A[iLayer], self.Y[i] - self.A[iLayer]) / self.M
                else:
                    # propagate the error through the *next* layer's weights
                    # (the original used self.W[iLayer].T, which is only
                    # shape-compatible for symmetric architectures)
                    self.delta[iLayer] = np.dot(self.W[iLayer + 1], self.delta[iLayer + 1]) * (self.A[iLayer] * (1 - self.A[iLayer]))
                # accumulate dW and dB (outer products)
                if iLayer == 0:
                    self.dW[iLayer] += self.X[i][:, np.newaxis] * self.delta[iLayer][:, np.newaxis].T
                else:
                    self.dW[iLayer] += self.A[iLayer - 1][:, np.newaxis] * self.delta[iLayer][:, np.newaxis].T
                self.dB[iLayer] += self.delta[iLayer]
        # gradient-descent update
        for iLayer in range(self.nLayers - 1):
            self.W[iLayer] -= (self.alpha / self.M) * self.dW[iLayer]
            self.B[iLayer] -= (self.alpha / self.M) * self.dB[iLayer]

    def PlainAutoEncoder(self):
        for i in range(self.steps):
            self.BackPropAlgorithm()
            print("step:%d" % i, "Jw=%f" % self.Jw)

    def ValidateAutoEncoder(self):
        # one output row per hidden dimension
        a = np.arange(1, self.nNodes[1] + 1)
        df = pd.DataFrame(a, columns=['weidu'])
        for i in range(self.M):
            print(self.X[i])
            for iLayer in range(self.nLayers - 1):
                if iLayer == 0:  # input layer
                    self.Z[iLayer] = np.dot(self.X[i], self.W[iLayer])
                else:
                    self.Z[iLayer] = np.dot(self.A[iLayer - 1], self.W[iLayer])
                self.A[iLayer] = self.sigmod(self.Z[iLayer] + self.B[iLayer])
                print("\t layer=%d" % iLayer, self.A[iLayer])
                if iLayer == 0:
                    # the hidden activation is the compressed representation
                    df[str(i + 1)] = self.A[iLayer]
        df.to_csv("jaingwei.txt", index=False)


# Load the user-by-category matrix produced by datadeal.py.
data = []
index = []
f = open('./data_matrix.txt', 'r')
for line in f.readlines():
    ss = line.replace('\n', '').split(',')
    index.append(ss[0])  # first field is the user id
    data.append([float(v) for v in ss[1:]])
f.close()
x = np.array(data)
# standardize the features
xx = preprocessing.scale(x)
nNodes = np.array([10, 5, 10])  # 10 input dims -> 5 hidden -> 10 output
ae3 = AutoEncoder(xx, xx, nNodes)
ae3.PlainAutoEncoder()
ae3.ValidateAutoEncoder()
print("AE result:", ae3.A[0])

# Toy example (one-hot inputs); the output has the same structure:
# xx = np.array([[0,0,0,0,0,0,0,1], [0,0,0,0,0,0,1,0], [0,0,0,0,0,1,0,0],
#                [0,0,0,0,1,0,0,0], [0,0,0,1,0,0,0,0], [0,0,1,0,0,0,0,0]])
# nNodes = np.array([8, 3, 8])
# ae2 = AutoEncoder(xx, xx, nNodes)
# ae2.PlainAutoEncoder()
# ae2.ValidateAutoEncoder()
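If the trained network is to be reused purely as a dimensionality reducer, only the first (encoding) layer matters. Below is a minimal sketch; encode is a hypothetical helper, not part of AE.py, that simply repeats the forward pass up to the hidden layer of a trained AutoEncoder instance such as ae3:

import numpy as np

# Hypothetical helper: project samples onto the m-dimensional hidden
# representation learned by a trained AutoEncoder instance.
def encode(ae, X_new):
    z = np.dot(X_new, ae.W[0]) + ae.B[0]  # first-layer pre-activation
    return 1.0 / (1.0 + np.exp(-z))       # same logistic activation as AE.py

# Usage: codes = encode(ae3, xx)  ->  shape (n_samples, 5) for nNodes=[10, 5, 10]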
Finally, cluster the compressed features with scikit-learn's KMeans:
#!/usr/bin/python
# coding:utf-8
# Author :wuyy
import numpy as np
import pandas as pd
from sklearn import metrics, preprocessing
from sklearn.cluster import KMeans

# Load the compressed features written by AE.py.
data = pd.read_table('jaingwei.txt', sep=",")
data = data.T
x = data.iloc[1:, 0:5]  # .ix was removed from pandas; use positional .iloc
print(x)
card = data.iloc[:, 0]
x1 = np.array(x)
print("x1:", x1)
xx = preprocessing.scale(x1)
print("preprocessing.scale xx:", xx)

num_clusters = 3
# n_jobs was removed from KMeans in scikit-learn 1.0; parallelism is automatic.
clf = KMeans(n_clusters=num_clusters, n_init=1, verbose=1)
clf.fit(xx)
print("label:", clf.labels_)
labels = clf.labels_

# score is the silhouette coefficient: higher means better-separated clusters.
score = metrics.silhouette_score(xx, labels)
# clf.inertia_ (within-cluster sum of squares) helps judge whether the number
# of clusters is appropriate; smaller means tighter clusters.
print("clf.inertia_", clf.inertia_)
print(score)
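The script fixes num_clusters = 3. Since clf.inertia_ and the silhouette score are exactly the quantities used above to judge whether the cluster count is appropriate, a small sweep over k makes the choice concrete. A sketch, assuming xx from the script above is still in scope:

from sklearn import metrics
from sklearn.cluster import KMeans

# Try several cluster counts and report both criteria:
# inertia always decreases with k (look for the "elbow"),
# while silhouette peaks at a well-separated clustering.
for k in range(2, 8):
    km = KMeans(n_clusters=k, n_init=10).fit(xx)
    sil = metrics.silhouette_score(xx, km.labels_)
    print("k=%d  inertia=%.2f  silhouette=%.3f" % (k, km.inertia_, sil))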
GitHub address:
Reposted from: