# -*- coding: utf-8 -*-
"""
Created on Tue May 23 10:27:15 2017
@author: phung
"""
from math import log
import operator
def createDataSet():
    """Return the training data and its feature labels.

    Each row is [age, bmi, bmd, ictp, pinp, class] where class is
    1 (at risk of bone fracture) or 0 (not at risk).

    Fix: several rows in the original used European decimal commas
    (e.g. ``24,8358`` instead of ``24.8358``), which Python parsed as
    extra list elements — those rows silently had 7 or 8 columns,
    corrupting the 6-column schema every other function assumes.
    """
    dataSet = [[79, 24.7252, 0.818, 9.170, 37.383, 1],
               [89, 25.9909, 0.871, 7.561, 24.685, 1],
               [70, 25.3934, 1.358, 5.347, 40.620, 1],
               [88, 23.2254, 0.714, 7.354, 56.782, 1],
               [85, 24.6097, 0.748, 6.760, 58.358, 1],
               [68, 25.0762, 0.935, 4.939, 67.123, 0],
               [70, 19.8839, 1.040, 4.321, 26.399, 0],
               [69, 25.0593, 1.002, 4.212, 47.515, 0],
               [74, 25.6544, 0.987, 5.605, 26.132, 0],
               [79, 19.9594, 0.863, 5.204, 60.267, 0],
               [76, 22.5981, 0.889, 4.704, 27.026, 1],
               [76, 26.4236, 0.886, 5.115, 43.256, 0],
               [62, 20.3223, 0.889, 5.741, 51.097, 1],
               [69, 19.3698, 0.790, 3.880, 49.678, 0],
               [72, 24.2215, 0.988, 5.844, 41.672, 0],
               [67, 32.1120, 1.119, 4.160, 60.356, 0],
               [74, 25.3934, 1.037, 6.728, 40.225, 0],
               [69, 23.8895, 0.893, 4.203, 27.334, 0],
               [78, 24.6755, 0.850, 7.347, 28.893, 1],
               [71, 27.1314, 0.790, 4.467, 38.173, 0],
               [74, 23.0518, 0.597, 4.835, 35.141, 1],
               [76, 23.4568, 0.889, 5.345, 27.568, 1],
               [75, 23.5457, 0.803, 3.773, 36.726, 1],
               [70, 23.3234, 0.919, 3.672, 40.093, 0],
               [69, 22.8625, 0.870, 4.552, 29.627, 1],
               [71, 22.0384, 0.811, 4.286, 30.380, 0],
               [80, 24.6914, 0.859, 5.706, 37.529, 1],
               [79, 26.8519, 0.867, 3.563, 43.924, 1],
               [72, 27.1809, 0.717, 3.760, 39.714, 0],
               [78, 23.9512, 0.822, 3.453, 27.294, 0],
               [80, 28.3874, 1.004, 5.948, 33.376, 1],
               [79, 23.5102, 0.738, 4.193, 65.640, 0],
               [67, 19.7232, 0.865, 4.443, 36.252, 1],
               [84, 27.4406, 0.808, 5.482, 33.539, 1],
               [78, 28.6661, 0.955, 8.815, 42.398, 0],
               [65, 23.7812, 0.912, 4.704, 39.254, 0],
               [70, 23.4493, 0.857, 4.138, 75.947, 0],
               [67, 25.5354, 0.855, 3.727, 41.851, 0],
               [74, 24.7409, 0.959, 3.967, 42.293, 0],
               [73, 22.2291, 1.036, 4.438, 40.222, 0],
               [74, 34.4753, 1.092, 7.271, 45.434, 0],
               [68, 32.1929, 0.000, 4.269, 50.841, 1],
               [80, 23.3355, 0.759, 4.856, 31.114, 0],
               [78, 22.7903, 0.757, 4.831, 73.343, 0],
               [79, 24.6097, 0.671, 4.870, 68.924, 1],
               [72, 27.5802, 0.814, 3.021, 27.088, 0],
               [67, 30.1205, 1.101, 7.538, 35.487, 1],
               [70, 25.8166, 0.818, 3.564, 36.001, 0],
               [69, 30.4218, 1.088, 3.826, 33.833, 0],
               [67, 28.7132, 0.934, 3.996, 56.167, 0],
               [74, 34.5429, 0.969, 6.762, 43.099, 0],
               [71, 24.6097, 0.794, 4.350, 39.023, 0],
               [67, 23.5294, 0.830, 3.176, 36.595, 0],
               [67, 25.6173, 1.057, 3.738, 32.550, 0],
               [65, 25.3086, 1.160, 3.060, 44.757, 0],
               [66, 24.8358, 0.811, 3.263, 26.941, 0],  # was 24,8358 / 3,263
               [69, 22.3094, 0.977, 3.106, 27.951, 0],  # was 3,106
               [72, 26.5285, 1.063, 6.970, 41.188, 0],  # was 26,5285
               [75, 25.8546, 1.091, 4.798, 36.045, 0],
               [70, 20.6790, 0.741, 3.908, 30.198, 0],
               [74, 28.3675, 1.045, 4.784, 31.339, 0],
               [71, 29.0688, 1.066, 4.527, 24.252, 0],  # was 1,066 / 4,527
               [65, 23.9995, 0.841, 3.089, 79.910, 0],
               [77, 22.9819, 1.015, 4.041, 57.147, 0],
               [67, 33.3598, 1.129, 7.239, 67.103, 1],
               [66, 27.1314, 1.030, 4.096, 29.435, 0],
               [70, 24.7676, 0.896, 4.352, 44.291, 0],  # was 0,896
               [70, 24.4193, 1.106, 2.823, 37.348, 0]]
    labels = ['age', 'bmi', 'bmd', 'ictp', 'pinp']
    return dataSet, labels
def calcShannonEnt(dataSet):
    """Compute the Shannon entropy (base 2) of the class labels.

    The class label is assumed to be the last element of each row.
    Returns 0.0 for a perfectly pure dataset.
    """
    total = len(dataSet)
    # Tally occurrences of each distinct class label.
    counts = {}
    for row in dataSet:
        label = row[-1]
        counts[label] = counts.get(label, 0) + 1
    # H = -sum(p * log2(p)) over the label distribution.
    entropy = 0.0
    for occurrences in counts.values():
        p = float(occurrences) / total
        entropy -= p * log(p, 2)
    return entropy
def splitDataSet(dataSet, axis, value):
    """Return the rows whose column `axis` equals `value`, with that
    column removed from each returned row.

    The input dataSet is not modified; each result row is a new list.
    """
    matched = []
    for row in dataSet:
        if row[axis] != value:
            continue
        # Drop the splitting column, keeping everything around it.
        matched.append(row[:axis] + row[axis + 1:])
    return matched
def chooseBestFeatureToSplit(dataSet):
    """Return the index of the feature with the highest information gain.

    Returns -1 when no split yields a positive gain over the base
    entropy of the whole dataset.
    """
    featureCount = len(dataSet[0]) - 1  # final column is the class label
    baseEntropy = calcShannonEnt(dataSet)
    bestGain = 0.0
    bestIndex = -1
    for idx in range(featureCount):
        # Weighted entropy after partitioning on each distinct value
        # of this feature.
        distinctValues = {row[idx] for row in dataSet}
        weightedEntropy = 0.0
        for v in distinctValues:
            subset = splitDataSet(dataSet, idx, v)
            weight = len(subset) / float(len(dataSet))
            weightedEntropy += weight * calcShannonEnt(subset)
        gain = baseEntropy - weightedEntropy  # reduction in entropy
        if gain > bestGain:
            bestGain = gain
            bestIndex = idx
    return bestIndex
def majorityCnt(classList):
    """Return the most frequent class label in classList (majority vote).

    Fix: the original called ``classCount.iteritems()``, which exists
    only in Python 2 and raises AttributeError on Python 3; ``items()``
    works on both.
    """
    classCount = {}
    for vote in classList:
        classCount[vote] = classCount.get(vote, 0) + 1
    # Sort by count, descending; the first entry is the majority label.
    sortedClassCount = sorted(classCount.items(),
                              key=operator.itemgetter(1), reverse=True)
    return sortedClassCount[0][0]
def createTree(dataSet, labels):
    """Recursively build an ID3 decision tree as nested dicts.

    Tree shape: {featureLabel: {featureValue: subtree-or-classLabel}}.
    Leaves are raw class labels.

    Fix: the original executed ``del labels[bestFeat]`` on the caller's
    list, silently mutating it across the call. We now delete from a
    copy, preserving the exact ``del`` indexing semantics (including a
    possible -1 from chooseBestFeatureToSplit) without the side effect.
    """
    classList = [example[-1] for example in dataSet]
    # Stop: every remaining row has the same class -> leaf.
    if classList.count(classList[0]) == len(classList):
        return classList[0]
    # Stop: no features left to split on -> majority-vote leaf.
    if len(dataSet[0]) == 1:
        return majorityCnt(classList)
    # Split on the feature with the highest information gain.
    bestFeat = chooseBestFeatureToSplit(dataSet)
    bestFeatLabel = labels[bestFeat]
    myTree = {bestFeatLabel: {}}
    remaining = labels[:]          # copy so the caller's list is untouched
    del remaining[bestFeat]
    featValues = [example[bestFeat] for example in dataSet]
    for value in set(featValues):
        myTree[bestFeatLabel][value] = createTree(
            splitDataSet(dataSet, bestFeat, value), remaining[:])
    return myTree
def classify(inputTree, featLabels, testVec):
    """Walk the decision tree and return the class label for testVec.

    inputTree  -- nested dict produced by createTree
    featLabels -- feature names, in the same order as testVec's columns
    testVec    -- one sample (extra trailing columns are ignored)

    Raises KeyError if the sample's feature value has no branch.

    Fixes:
    - ``inputTree.keys()[0]`` raises TypeError on Python 3 (dict views
      are not subscriptable); ``next(iter(...))`` works everywhere.
    - ``inputTree[first]`` referenced an undefined name ``first``
      (NameError at runtime); it must be ``firstStr``.
    """
    firstStr = next(iter(inputTree))      # feature tested at this node
    secondDict = inputTree[firstStr]      # branches keyed by feature value
    featIndex = featLabels.index(firstStr)
    key = testVec[featIndex]
    valueOfFeat = secondDict[key]
    if isinstance(valueOfFeat, dict):
        # Internal node: descend into the matching branch.
        return classify(valueOfFeat, featLabels, testVec)
    # Leaf node: this is the class label.
    return valueOfFeat
def storeTree(inputTree, filename):
    """Serialize inputTree to `filename` using pickle.

    Fix: pickle writes bytes, so the file must be opened in binary
    mode ('wb'); text mode raises TypeError on Python 3. A ``with``
    block guarantees the handle is closed even if dump fails.
    """
    import pickle
    with open(filename, 'wb') as fw:
        pickle.dump(inputTree, fw)
def grabTree(filename):
    """Load and return a pickled tree from `filename`.

    Fix: pickle data is bytes, so the file must be opened in binary
    mode ('rb'); the original's default text mode fails on Python 3.
    A ``with`` block ensures the handle is closed.

    NOTE(review): pickle.load can execute arbitrary code from a
    malicious file — only load files written by storeTree.
    """
    import pickle
    with open(filename, 'rb') as fr:
        return pickle.load(fr)
# collect data
myDat, labels = createDataSet()
# build a tree (note: createTree may consume `labels`, so classify below
# uses a fresh label list)
mytree = createTree(myDat, labels)
print(mytree)
# classify one sample; the trailing 0 (the sample's true class) is simply
# ignored by classify(), which indexes by feature position
answer = classify(mytree, ['age', 'bmi', 'bmd', 'ictp', 'pinp'],
                  [70, 24.4193, 1.106, 2.823, 37.348, 0])
# Fix: `answer` is an int (0/1); concatenating str + int raises
# "TypeError: cannot concatenate 'str' and 'int' objects" — exactly the
# traceback the author pasted below. Convert explicitly.
print("Ban " + str(answer) + "bi nguy co gay xuong")
# ---------------------------------------------------------------------------
# NOTE(review): the text below was a forum question pasted verbatim into the
# source file — it is not Python and made the whole file a SyntaxError. It is
# preserved here as comments.
#
# "Got an error (I'm using Python 2.7)":
#   Traceback (most recent call last):
#     File "C:\Users\phung\Desktop\ve.py", line 217, in <module>
#       print ("Hi, the answer is "+ answer + ", it is winter family photo")
#   TypeError: cannot concatenate 'str' and 'int' objects
#   >>>
# "Update: I've now fixed the dict error. Now it reports this instead —
#  please help":
#     File "C:/Users/phung/untitled1.py", line 92, in calcShannonEnt
#       prob = float(labelCounts.keys) / numEntries
#   TypeError: float() argument must be a string or a number
# ---------------------------------------------------------------------------