song.yz@foxmail.com wechat: math-box

统计机器学习

附录:朴素贝叶斯方法代码



这里给出朴素贝叶斯方法的源代码,仅供参考。如对其他代码感兴趣,欢迎与我联系。这里只给出部分算法的源代码,代码由 Go 语言编写。这些代码没有经过详细测试,所以可能有 bug。

另外,在一些公开数据集的测试中,我发现国外一些文献中同样的机器学习算法得到的识别准确率比我的程序要高。因此,程序可能还有一些问题,请谨慎使用。

package main

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"
	"strings"
)

// main is the program entry point. It only delegates to test, which runs the
// naive Bayes train-and-evaluate demo.
//
// Created:   Song Yezhi, 2022-06-13 (machine learning test)
// Email:     song.yz@foxmail.com
// Copyright: (C) 2022 Chinese Academy of Sciences. All rights reserved.
func main() {
	test()
}

// test trains a Gaussian naive Bayes classifier on "training.txt", classifies
// every sample in "test.txt" and prints each prediction followed by the overall
// accuracy.
//
// Both files are comma separated: lenAttribute numeric attributes followed by
// the class label. Blank lines and lines starting with '#' are ignored.
// Rows that are too short or contain a non-numeric attribute are reported on
// stderr and skipped instead of being silently scored as zeros.
func test() {
	// Flip to true to dump the fitted model parameters.
	const debug = false

	finName := "training.txt"

	catalog, lenAttribute := GetCatalog(finName)
	fmt.Printf("lenAttribute = %d\n", lenAttribute)
	if len(catalog) == 0 {
		fmt.Fprintf(os.Stderr, "test: no classes found in %s\n", finName)
		return
	}

	meanEach, stdEach, priorProbability := naiveBayesTraining(finName, catalog, lenAttribute)
	if len(meanEach) != len(catalog) || len(stdEach) != len(catalog) ||
		len(priorProbability) != len(catalog) {
		fmt.Fprintf(os.Stderr, "test: training on %s did not produce a model\n", finName)
		return
	}

	if debug {
		fmt.Println("meanEach =")
		MatOutput(meanEach)

		fmt.Println("stdEach =")
		MatOutput(stdEach)

		fmt.Println("priorProbability =")
		VecOutput(priorProbability)
	}

	ftestName := "test.txt"
	ftest, err := os.Open(ftestName)
	if err != nil {
		// Reading from a nil *os.File never yields io.EOF, so carrying on
		// here would spin forever in the loop below.
		fmt.Fprintf(os.Stderr, "test: %v\n", err)
		return
	}
	defer ftest.Close()

	br := bufio.NewReader(ftest)
	objAttribute := make([]float64, lenAttribute)

	total := 0   // number of classified samples
	correct := 0 // number of samples whose prediction matches the label
	lineNo := 0

	for done := false; !done; {
		line, rerr := br.ReadString('\n')
		if rerr != nil {
			if rerr != io.EOF {
				fmt.Fprintf(os.Stderr, "test: reading %s: %v\n", ftestName, rerr)
				return
			}
			// ReadString returns the data read so far together with io.EOF,
			// so a last line without a trailing newline is still processed.
			done = true
		}
		lineNo++

		record := strings.TrimSpace(line)
		if record == "" || record[0] == '#' {
			continue
		}

		word := strings.Split(record, ",")
		if len(word) <= lenAttribute {
			fmt.Fprintf(os.Stderr, "%s:%d: expected %d attributes and a class label, skipping\n",
				ftestName, lineNo, lenAttribute)
			continue
		}

		valid := true
		for j := 0; j < lenAttribute; j++ {
			value, perr := strconv.ParseFloat(strings.TrimSpace(word[j]), 64)
			if perr != nil {
				fmt.Fprintf(os.Stderr, "%s:%d: attribute %d: %v, skipping\n",
					ftestName, lineNo, j+1, perr)
				valid = false
				break
			}
			objAttribute[j] = value
		}
		if !valid {
			continue
		}

		_, objclass := naiveBayes(objAttribute, meanEach, stdEach, priorProbability, catalog)

		// Labels may carry surrounding whitespace or a line terminator, so
		// compare and print the trimmed text.
		predicted := strings.TrimSpace(objclass)
		actual := strings.TrimSpace(word[lenAttribute])

		total++
		fmt.Printf("the %d object predicion class is %s , the real object is %s \n", total, predicted, actual)

		if predicted == actual {
			correct++
		}
	}

	if total == 0 {
		// Avoid printing NaN from 0/0.
		fmt.Fprintf(os.Stderr, "test: no usable samples in %s\n", ftestName)
		return
	}

	fmt.Printf("The prediction accuracy is %f \n", float64(correct)/float64(total))
}

// naiveBayes classifies one sample with a Gaussian naive Bayes model.
//
// Parameters:
//
//	objAttribute     - attribute values of the sample to classify
//	meanEach         - meanEach[k][i] is the mean of attribute i in class k
//	stdEach          - stdEach[k][i] is the standard deviation of attribute i in class k
//	priorProbability - priorProbability[k] is the prior probability of class k
//	catalog          - class labels; catalog[k] names class k
//
// Returns:
//
//	objFunc  - objFunc[k] is the prior of class k times the product of the
//	           per-attribute normal densities evaluated at the sample
//	objclass - label of the selected class; "" when catalog is empty
//
// The class is selected by comparing the logarithm of that score, accumulated
// as a sum of log-densities. The plain product underflows to 0 for samples far
// from every class mean (or with many attributes), which would make every
// class tie at 0 and always select the first one. A class whose score is NaN
// (for example because a standard deviation is 0) is only selected when every
// class is NaN.
//
// Created:   Song Yezhi, 2022-06-13
// Email:     song.yz@foxmail.com
// Copyright: (C) 2022 Chinese Academy of Sciences. All rights reserved.
func naiveBayes(objAttribute []float64, meanEach [][]float64,
	stdEach [][]float64, priorProbability []float64,
	catalog []string) (objFunc []float64, objclass string) {
	nCatalog := len(catalog)
	objFunc = make([]float64, nCatalog)

	// log(sqrt(2*pi)), the constant part of every log-density.
	logNorm := 0.5 * math.Log(2.0*math.Pi)

	best := -1
	bestLog := math.Inf(-1)

	for k := 0; k < nCatalog; k++ {
		likelihood := 1.0
		logScore := math.Log(priorProbability[k])

		for i, x := range objAttribute {
			sigma := stdEach[k][i]
			diff := x - meanEach[k][i]
			exponent := diff * diff / (2.0 * sigma * sigma)

			density := (1.0 / (math.Sqrt(2.0*math.Pi) * sigma)) * math.Exp(-exponent)
			likelihood = likelihood * density

			logScore -= exponent + math.Log(sigma) + logNorm
		}

		objFunc[k] = likelihood * priorProbability[k]

		// Strict '>' keeps the first class on ties; the NaN clause lets a
		// valid score replace a NaN that was seen earlier.
		if best < 0 || logScore > bestLog || (math.IsNaN(bestLog) && !math.IsNaN(logScore)) {
			best = k
			bestLog = logScore
		}
	}

	if best >= 0 {
		objclass = catalog[best]
	}

	return objFunc, objclass
}

// getMax returns the index of the largest element of v.
//
// When several elements share the largest value, the lowest index is
// returned. NaN elements are never preferred over a number: a NaN index is
// returned only when every element is NaN (index 0 in that case). An empty
// slice yields -1.
//
// Created: Song Yezhi, 2022-06-13
func getMax(v []float64) (imax int) {
	if len(v) == 0 {
		return -1
	}

	imax = 0
	for i := 1; i < len(v); i++ {
		// Every comparison against NaN is false, so without the second
		// clause a NaN in the current best slot could never be replaced.
		if v[i] > v[imax] || (math.IsNaN(v[imax]) && !math.IsNaN(v[i])) {
			imax = i
		}
	}

	return imax
}

// naiveBayesTraining fits a Gaussian naive Bayes model to the samples stored
// in the file finName.
//
// Parameters:
//
//	finName      - training data file; each row holds lenAttribute comma
//	               separated numbers followed by the class label
//	catalog      - the class labels to fit; labels are matched after trimming
//	               surrounding whitespace, so entries may or may not carry a
//	               line terminator
//	lenAttribute - number of attributes per row, not counting the label
//
// Returns:
//
//	meanEach         - meanEach[k][i] is the mean of attribute i in class k
//	stdEach          - stdEach[k][i] is the sample standard deviation
//	                   (divisor n-1) of attribute i in class k
//	priorProbability - fraction of the used rows that belong to class k
//
// Rows whose label is not in catalog are ignored. A class with no rows gets
// NaN means, and a class with fewer than two rows gets NaN standard
// deviations, because the corresponding divisions are 0/0.
//
// If the file cannot be opened or read, the error is written to stderr and
// all three results are nil.
//
// Created:   Song Yezhi, 2022-06-12
// Email:     song.yz@foxmail.com
// Copyright: (C) 2022 Chinese Academy of Sciences. All rights reserved.
func naiveBayesTraining(finName string, catalog []string, lenAttribute int) (meanEach [][]float64,
	stdEach [][]float64, priorProbability []float64) {
	samples, classOf, err := loadTrainingSamples(finName, catalog, lenAttribute)
	if err != nil {
		fmt.Fprintf(os.Stderr, "naiveBayesTraining: %v\n", err)
		return nil, nil, nil
	}

	nCatalog := len(catalog)

	meanEach = make([][]float64, nCatalog)
	stdEach = make([][]float64, nCatalog)
	for k := 0; k < nCatalog; k++ {
		meanEach[k] = make([]float64, lenAttribute)
		stdEach[k] = make([]float64, lenAttribute)
	}
	priorProbability = make([]float64, nCatalog)
	numberEachCatalog := make([]int, nCatalog)

	// Per-class attribute sums and row counts.
	for n, x := range samples {
		k := classOf[n]
		numberEachCatalog[k]++
		for i, value := range x {
			meanEach[k][i] += value
		}
	}
	for k := 0; k < nCatalog; k++ {
		for i := 0; i < lenAttribute; i++ {
			meanEach[k][i] /= float64(numberEachCatalog[k])
		}
	}

	// Per-class sums of squared deviations from the class mean.
	for n, x := range samples {
		k := classOf[n]
		for i, value := range x {
			diff := value - meanEach[k][i]
			stdEach[k][i] += diff * diff
		}
	}
	for k := 0; k < nCatalog; k++ {
		for i := 0; i < lenAttribute; i++ {
			stdEach[k][i] = math.Sqrt(stdEach[k][i] / float64(numberEachCatalog[k]-1))
		}
	}

	total := len(samples)
	for k := 0; k < nCatalog; k++ {
		priorProbability[k] = float64(numberEachCatalog[k]) / float64(total)
	}

	return meanEach, stdEach, priorProbability
}

// loadTrainingSamples reads finName once and returns, in file order, the
// attribute vector of every usable row together with the index into catalog
// of that row's class.
//
// Blank lines and lines starting with '#' are ignored, as are rows whose label
// is not in catalog. Rows that are too short or hold a non-numeric attribute
// are reported on stderr and skipped.
func loadTrainingSamples(finName string, catalog []string, lenAttribute int) ([][]float64, []int, error) {
	fin, err := os.Open(finName)
	if err != nil {
		return nil, nil, fmt.Errorf("opening training data: %w", err)
	}
	defer fin.Close()

	// Filled back to front so that the first of any duplicate labels wins.
	classIndex := make(map[string]int, len(catalog))
	for k := len(catalog) - 1; k >= 0; k-- {
		classIndex[strings.TrimSpace(catalog[k])] = k
	}

	var samples [][]float64
	var classOf []int

	br := bufio.NewReader(fin)
	lineNo := 0

	for done := false; !done; {
		line, rerr := br.ReadString('\n')
		if rerr != nil {
			if rerr != io.EOF {
				return nil, nil, fmt.Errorf("reading %s: %w", finName, rerr)
			}
			// ReadString hands back the partial last line together with
			// io.EOF, so it is still processed below.
			done = true
		}
		lineNo++

		record := strings.TrimSpace(line)
		if record == "" || record[0] == '#' {
			continue
		}

		word := strings.Split(record, ",")
		if len(word) <= lenAttribute {
			fmt.Fprintf(os.Stderr, "%s:%d: expected %d attributes and a class label, skipping\n",
				finName, lineNo, lenAttribute)
			continue
		}

		k, known := classIndex[strings.TrimSpace(word[lenAttribute])]
		if !known {
			continue
		}

		x := make([]float64, lenAttribute)
		valid := true
		for i := range x {
			value, perr := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
			if perr != nil {
				fmt.Fprintf(os.Stderr, "%s:%d: attribute %d: %v, skipping\n",
					finName, lineNo, i+1, perr)
				valid = false
				break
			}
			x[i] = value
		}
		if !valid {
			continue
		}

		samples = append(samples, x)
		classOf = append(classOf, k)
	}

	return samples, classOf, nil
}

// GetCatalog scans the training data file finName and returns the distinct
// class labels it contains together with the number of attributes per row.
//
// Each data row is comma separated with the class label in the last field.
// Blank lines and lines whose first non-blank character is '#' are ignored.
//
// Returns:
//
//	catalog      - the distinct labels in order of first appearance. A label
//	               is stored exactly as it appears in the file, including any
//	               surrounding whitespace and line terminator, but two labels
//	               that differ only in such whitespace count as the same class.
//	lenAttribute - number of fields before the label in the last data row
//
// If the file cannot be opened or read, the error is written to stderr and
// (nil, 0) is returned.
//
// Created:   Song Yezhi, 2022-06-11
// Email:     song.yz@foxmail.com
// Copyright: (C) 2022 Chinese Academy of Sciences. All rights reserved.
func GetCatalog(finName string) (catalog []string, lenAttribute int) {
	fin, err := os.Open(finName)
	if err != nil {
		// Reading from a nil *os.File never reports io.EOF, so continuing
		// would loop forever.
		fmt.Fprintf(os.Stderr, "GetCatalog: %v\n", err)
		return nil, 0
	}
	defer fin.Close()

	seen := make(map[string]bool)
	br := bufio.NewReader(fin)

	for done := false; !done; {
		line, rerr := br.ReadString('\n')
		if rerr != nil {
			if rerr != io.EOF {
				fmt.Fprintf(os.Stderr, "GetCatalog: reading %s: %v\n", finName, rerr)
				return nil, 0
			}
			// The last line may lack a trailing newline; ReadString returns
			// it together with io.EOF, so it is still handled below.
			done = true
		}

		record := strings.TrimSpace(line)
		if record == "" || record[0] == '#' {
			continue
		}

		word := strings.Split(line, ",")

		// Everything except the trailing class label is an attribute.
		lenAttribute = len(word) - 1

		label := word[len(word)-1]
		key := strings.TrimSpace(label)
		if !seen[key] {
			seen[key] = true
			catalog = append(catalog, label)
		}
	}

	return catalog, lenAttribute
}

// Vec allocates a vector of N float64 elements, every one equal to 0.
//
// Author: Song Yezhi, 2020-05-24
func Vec(N int) (V []float64) {
	// make zero-fills the slice, so no explicit initialisation loop is needed.
	return make([]float64, N)
}

// Mat allocates an M-by-N matrix as a slice of M separately allocated rows of
// N float64 elements, every element equal to 0.
//
// Author: Song Yezhi, 2020-05-23
func Mat(M int, N int) (A [][]float64) {
	rows := make([][]float64, M)
	for r := range rows {
		// make zero-fills each row.
		rows[r] = make([]float64, N)
	}
	return rows
}

// MatOutput prints the matrix A to standard output, one row per line, with
// every element formatted as "%18.6f" followed by three spaces.
//
// Each row is printed with its own length, so an empty matrix prints nothing
// and rows of different lengths are printed in full rather than being cut off
// or indexed out of range.
//
// Author: Song Yezhi, 2020-05-24
func MatOutput(A [][]float64) {
	for _, row := range A {
		for _, value := range row {
			fmt.Printf("%18.6f   ", value)
		}
		fmt.Printf("\n")
	}
}

// VecOutput prints every element of V to standard output on its own line,
// formatted as "%18.6f".
func VecOutput(V []float64) {
	for _, value := range V {
		fmt.Printf("%18.6f\n", value)
	}
}

// MatDim returns the dimensions of the matrix A: M is the number of rows and
// N is the length of the last row. An empty matrix yields (0, 0).
//
// Author: Song Yezhi, 2020-05-24
func MatDim(A [][]float64) (M int, N int) {
	M = len(A)
	if M == 0 {
		// There is no last row to measure.
		return 0, 0
	}
	return M, len(A[M-1])
}