from math import log
import operator
def createDataset():
dataSet = [[79, 24.7252, 0.818, 9.170, 37.383, yes],
[89, 25.9909, 0.871, 7.561, 24.685, yes],
[70, 25.3934, 1.358, 5.347, 40.620, yes],
[88, 23.2254, 0.714, 7.354, 56.782, yes],
[85, 24.6097, 0.748, 6.760, 58.358, yes],
[68, 25.0762, 0.935, 4.939, 67.123, no],
[70, 19.8839, 1.040, 4.321, 26.399, no],
[69, 25.0593, 1.002, 4.212, 47.515, no],
[74, 25.6544, 0.987, 5.605, 26.132, no],
[79, 19.9594, 0.863, 5.204, 60.267, no],
[76, 22.5981, 0.889, 4.704, 27.026, yes],
[76, 26.4236, 0.886, 5.115, 43.256, no],
[62, 20.3223, 0.889, 5.741, 51.097, yes],
[69, 19.3698, 0.790, 3.880, 49.678, no],
[72, 24.2215, 0.988, 5.844, 41.672, no],
[67, 32.1120, 1.119, 4.160, 60.356, no],
[74, 25.3934, 1.037, 6.728, 40.225, no],
[69, 23.8895, 0.893, 4.203, 27.334, no],
[78, 24.6755, 0.850, 7.347, 28.893, yes],
[71, 27.1314, 0.790, 4.467, 38.173, no],
[74, 23.0518, 0.597, 4.835, 35.141, yes],
[76, 23.4568, 0.889, 5.345, 27.568, yes],
[75, 23.5457, 0.803, 3.773, 36.726, yes],
[70, 23.3234, 0.919, 3.672, 40.093, no],
[69, 22.8625, 0.870, 4.552, 29.627, yes],
[71, 22.0384, 0.811, 4.286, 30.380, no],
[80, 24.6914, 0.859, 5.706, 37.529, yes],
[79, 26.8519, 0.867, 3.563, 43.924, yes],
[72, 27.1809, 0.717, 3.760, 39.714, no],
[78, 23.9512, 0.822, 3.453, 27.294, no],
[80, 28.3874, 1.004, 5.948, 33.376, yes],
[79, 23.5102, 0.738, 4.193, 65.640, no],
[67, 19.7232, 0.865, 4.443, 36.252, yes],
[84, 27.4406, 0.808, 5.482, 33.539, yes],
[78, 28.6661, 0.955, 8.815, 42.398, no],
[65, 23.7812, 0.912, 4.704, 39.254, no],
[70, 23.4493, 0.857, 4.138, 75.947, no],
[67, 25.5354, 0.855, 3.727, 41.851, no],
[74, 24.7409, 0.959, 3.967, 42.293, no],
[73, 22.2291, 1.036, 4.438, 40.222, no],
[74, 34.4753, 1.092, 7.271, 45.434, no],
[68, 32.1929, 0.000, 4.269, 50.841, yes],
[80, 23.3355, 0.759, 4.856, 31.114, no],
[78, 22.7903, 0.757, 4.831, 73.343, no],
[79, 24.6097, 0.671, 4.870, 68.924, yes],
[72, 27.5802, 0.814, 3.021, 27.088, no,]
[67, 30.1205, 1.101, 7.538, 35.487, yes],
[70, 25.8166, 0.818, 3.564, 36.001, no],
[69, 30.4218, 1.088, 3.826, 33.833, no],
[67, 28.7132, 0.934, 3.996, 56.167, no],
[74, 34.5429, 0.969, 6.762, 43.099, no],
[71, 24.6097, 0.794, 4.350, 39.023, no],
[67, 23.5294, 0.830, 3.176, 36.595, no],
[67, 25.6173, 1.057, 3.738, 32.550, no],
[65, 25.3086, 1.160, 3.060, 44.757, no],
[66, 24,8358, 0.811, 3,263, 26.941, no],
[69, 22.3094, 0.977, 3,106, 27.951, no],
[72, 26,5285, 1.063, 6.970, 41.188, no],
[75, 25.8546, 1.091, 4.798, 36.045, no],
[70, 20.6790, 0.741, 3.908, 30.198, no],
[74, 28.3675, 1.045, 4.784, 31.339, no],
[71, 29.0688, 1,066, 4,527, 24.252, no],
[65, 23.9995, 0.841, 3.089, 79.910, no],
[77, 22.9819, 1.015, 4.041, 57.147, no],
[67, 33.3598, 1.129, 7.239, 67.103, yes],
[66, 27.1314, 1.030, 4.096, 29.435, no],
[70, 24.7676, 0,896, 4.352, 44.291, no],
[70, 24.4193, 1.106, 2.823, 37.348, no]]
labels = ['age', 'bmi', 'bmd', 'ictp', 'pinp']
# change to discrete values
return dataSet, labels
def calcShannonEnt(dataSet):
numEntries = len(dataSet)
labelCounts = {}
for featVec in dataSet: # the the number of unique elements and their occurance
currentLabel = featVec[-1]
if currentLabel not in labelCounts.keys(): labelCounts[currentLabel] = 0
labelCounts[currentLabel] += 1
shannonEnt = 0.0
for key in labelCounts:
prob = float(labelCounts[key]) / numEntries
shannonEnt -= prob * log(prob, 2) # log base 2
return shannonEnt
def splitDataSet(dataSet, axis, value):
retDataSet = []
for featVec in dataSet:
if featVec[axis] == value:
reducedFeatVec = featVec[:axis] # chop out axis used for splitting
reducedFeatVec.extend(featVec[axis + 1:])
retDataSet.append(reducedFeatVec)
return retDataSet
def chooseBestFeatureToSplit(dataSet):
numFeatures = len(dataSet[0]) - 1 # the last column is used for the labels
baseEntropy = calcShannonEnt(dataSet)
bestInfoGain = 0.0;
bestFeature = -1
for i in range(numFeatures): # iterate over all the features
featList = [example[i] for example in dataSet] # create a list of all the examples of this feature
uniqueVals = set(featList) # get a set of unique values
newEntropy = 0.0
for value in uniqueVals:
subDataSet = splitDataSet(dataSet, i, value)
prob = len(subDataSet) / float(len(dataSet))
newEntropy += prob * calcShannonEnt(subDataSet)
infoGain = baseEntropy - newEntropy # calculate the info gain; ie reduction in entropy
"""
print("feature : " + str(i))
print("baseEntropy : "+str(baseEntropy))
print("newEntropy : " + str(newEntropy))
print("infoGain : " + str(infoGain))
"""
if (infoGain > bestInfoGain): # compare this to the best gain so far
bestInfoGain = infoGain # if better than current best, set to best
bestFeature = i
return bestFeature # returns an integer
def majorityCnt(classList):
classCount = {}
for vote in classList:
if vote not in classCount.keys(): classCount[vote] = 0
classCount[vote] += 1
sortedClassCount = sorted(classCount.iteritems(), key=operator.itemgetter(1), reverse=True)
return sortedClassCount[0][0]
def createTree(dataSet, labels):
# extracting data
classList = [example[-1] for example in dataSet]
if classList.count(classList[0]) == len(classList):
return classList[0] # stop splitting when all of the classes are equal
if len(dataSet[0]) == 1: # stop splitting when there are no more features in dataSet
return majorityCnt(classList)
# use Information Gain
bestFeat = chooseBestFeatureToSplit(dataSet)
bestFeatLabel = labels[bestFeat]
#build a tree recursively
myTree = {bestFeatLabel: {}}
#print("myTree : "+labels[bestFeat])
del (labels[bestFeat])
featValues = [example[bestFeat] for example in dataSet]
#print("featValues: "+str(featValues))
uniqueVals = set(featValues)
#print("uniqueVals: " + str(uniqueVals))
for value in uniqueVals:
subLabels = labels[:] # copy all of labels, so trees don't mess up existing labels
#print("subLabels"+str(subLabels))
myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)
#print("myTree : " + str(myTree))
return myTree
def classify(inputTree, featLabels, testVec):
firstStr = inputTree.keys()[0]
#print("fistStr : "+firstStr)
secondDict = inputTree[firstStr]
#print("secondDict : " + str(secondDict))
featIndex = featLabels.index(firstStr)
#print("featIndex : " + str(featIndex))
key = testVec[featIndex]
#print("key : " + str(key))
valueOfFeat = secondDict[key]
#print("valueOfFeat : " + str(valueOfFeat))
if isinstance(valueOfFeat, dict):
#print("is instance: "+str(valueOfFeat))
classLabel = classify(valueOfFeat, featLabels, testVec)
else:
#print("is Not instance: " + valueOfFeat)
classLabel = valueOfFeat
return classLabel
def storeTree(inputTree, filename):
import pickle
fw = open(filename, 'w')
pickle.dump(inputTree, fw)
fw.close()
def grabTree(filename):
import pickle
fr = open(filename)
return pickle.load(fr)
# collect data
myDat, labels = createDataSet()
#build a tree
mytree = createTree(myDat, labels)
print(mytree)
#run test
answer = classifi(mytree, ['age', 'bmi', 'bmd', 'ictp', 'pinp'], [79, 24.7252, 0.818, 9.170, 37.383])
print(" Ban "+answer+" bi nguy co gay xuong")
đoạn code của m bị lôi:
Traceback (most recent call last):
File "C:\Users\phung\Desktop\ve.py", line 198, in <module>
myDat, labels = createDataSet()
NameError: name 'createDataSet' is not defined
M.n xem giùm m với ạ