from math import logimport operatordef createDataSet():dataSet = [[0, 0, 0, 0, 'N'],[0, 0, 0, 1, 'N'],[1, 0, 0, 0, 'Y'],[2, 1, 0, 0, 'Y'],[2, 2, 1, 0, 'Y'],[2, 2, 1, 1, 'N'],[1, 2, 1, 1, 'Y']]labels = ['outlook', 'temperature', 'humidity', 'windy']return dataSet, labelsdef calcShannonEnt(dataSet): # 计算熵numEntries = len(dataSet)labelCounts = {}for featVec in dataSet:currentLabel = featVec[-1]if currentLabel not in labelCounts.keys():labelCounts[currentLabel] = 0labelCounts[currentLabel] += 1 # 数每一类各多少个, {'Y': 4, 'N': 3}shannonEnt = 0.0for key in labelCounts:prob = float(labelCounts[key]) / numEntriesshannonEnt -= prob * log(prob, 2)return shannonEntdef chooseBestFeatureToSplit(dataSet):numFeatures = len(dataSet[0]) - 1 # feature个数baseEntropy = calcShannonEnt(dataSet) # 整个dataset的熵bestInfoGainRatio = 0.0bestFeature = -1for i in range(numFeatures):featList = [example[i] for example in dataSet] # 每个feature的listuniqueVals = set(featList) # 每个list的唯一值集合newEntropy = 0.0splitInfo = 0.0for value in uniqueVals:subDataSet = splitDataSet(dataSet, i, value) # 每个唯一值对应的剩余feature的组成子集prob = len(subDataSet) / float(len(dataSet))newEntropy += prob * calcShannonEnt(subDataSet)splitInfo += -prob * log(prob, 2)infoGain = baseEntropy - newEntropy # 这个feature的infoGainif (splitInfo == 0): # fix the overflow bugcontinueinfoGainRatio = infoGain / splitInfo # 这个feature的infoGainRatio增益率if (infoGainRatio > bestInfoGainRatio): # 选择最大的gain ratiobestInfoGainRatio = infoGainRatiobestFeature = i # 选择最大的gain ratio对应的featurereturn bestFeaturedef splitDataSet(dataSet, axis, value):retDataSet = []for featVec in dataSet:if featVec[axis] == value: # 只看当第i列的值=value时的itemreduceFeatVec = featVec[:axis] # featVec的第i列给除去reduceFeatVec.extend(featVec[axis + 1:])retDataSet.append(reduceFeatVec)return retDataSetdef createTree(dataSet, labels):classList = [example[-1] for example in dataSet] # ['N', 'N', 'Y', 'Y', 'Y', 'N', 'Y']if classList.count(classList[0]) == len(classList):# classList所有元素都相等,即类别完全相同,停止划分return classList[0] # splitDataSet(dataSet, 0, 0)此时全是N,返回Nif len(dataSet[0]) == 1: # [0, 0, 0, 0, 'N']# 遍历完所有特征时返回出现次数最多的return majorityCnt(classList)bestFeat = chooseBestFeatureToSplit(dataSet) # 0-> 2# 选择最大的gain ratio对应的featurebestFeatLabel = labels[bestFeat] # outlook -> windymyTree = {bestFeatLabel: {}}# 多重字典构建树{'outlook': {0: 'N'del (labels[bestFeat]) # ['temperature', 'humidity', 'windy'] -> ['temperature', 'humidity']featValues = [example[bestFeat] for example in dataSet] # [0, 0, 1, 2, 2, 2, 1]uniqueVals = set(featValues)for value in uniqueVals:subLabels = labels[:] # ['temperature', 'humidity', 'windy']myTree[bestFeatLabel][value] = createTree(splitDataSet(dataSet, bestFeat, value), subLabels)# 划分数据,为下一层计算准备return myTreedef majorityCnt(classList): # 如果属性完全相同,却不具有相同的类别,则采用少数服从多数的原则进行划分classCount = {}for vote in classList:if vote not in classCount.keys():classCount[vote] = 0else:classCount[vote] += 1sortedClassCount = sorted(classCount.items(), key=operator.itemgetter(1), reverse=True)return sortedClassCount[0][0]
https://github.com/cdqncn/JueCeShu/blob/master/myTree.py
版权声明
本站仅做备份收录,仅供研究与教学参考之用。
读者将信息用于其他用途的,全部法律及连带责任由读者自行承担,本站不承担任何责任。









评论