统计机器学习
附录:朴素贝叶斯方法代码
这里给出部分算法的源代码,仅供参考;如对其他代码感兴趣,欢迎与我联系。代码由 Go 语言编写,没有经过详细测试,所以可能存在 bug。
另外,在一些公开数据集上的测试中,我发现国外一些文献中同样算法的识别准确率比我的实现要高。因此,程序可能还存在一些问题,请谨慎使用。
package main
import (
"bufio"
"fmt"
"io"
"math"
"os"
"strconv"
"strings"
)
// main is the program entry point; it does nothing except run test.
//
// Created : Song Yezhi 2022-6-13 2:40 (machine learning test)
// Email   : song.yz@foxmail.com
// Copyright (C) Chinese Academy of Sciences. All rights reserved, 2022.
func main() {
	test()
}
// test trains the naive Bayes model from "training.txt", classifies every
// sample found in "test.txt", prints one line per prediction and finally the
// overall accuracy.
//
// Both files are comma separated: lenAttribute numeric fields followed by the
// class label. Blank lines and lines whose first byte is '#' are skipped.
// Lines with too few fields or a non-numeric attribute are reported on stderr
// and skipped instead of crashing or being silently read as zeros.
func test() {
	const debug = false // set to true to dump the trained model

	finName := "training.txt"
	catalog, lenAttribute := GetCatalog(finName)
	fmt.Printf("lenAttribute = %d\n", lenAttribute)
	if len(catalog) == 0 {
		fmt.Fprintf(os.Stderr, "no training classes found in %s\n", finName)
		return
	}

	meanEach, stdEach, priorProbability := naiveBayesTraining(finName, catalog, lenAttribute)
	if debug {
		fmt.Println("meanEach =")
		MatOutput(meanEach)
		fmt.Println("stdEach =")
		MatOutput(stdEach)
		fmt.Println("priorProbability =")
		VecOutput(priorProbability)
	}

	ftestName := "test.txt"
	ftest, err := os.Open(ftestName)
	if err != nil {
		// Without this check every read fails with a non-EOF error and the
		// loop below would never terminate.
		fmt.Fprintf(os.Stderr, "open test data: %v\n", err)
		return
	}
	defer ftest.Close()

	objAttribute := Vec(lenAttribute)

	// parseSample fills objAttribute from one data line and returns the
	// whitespace-trimmed class label; ok is false for malformed lines.
	parseSample := func(line string) (label string, ok bool) {
		word := strings.Split(line, ",")
		if len(word) <= lenAttribute {
			return "", false
		}
		for j := 0; j < lenAttribute; j++ {
			v, perr := strconv.ParseFloat(strings.TrimSpace(word[j]), 64)
			if perr != nil {
				return "", false
			}
			objAttribute[j] = v
		}
		return strings.TrimSpace(word[lenAttribute]), true
	}

	br := bufio.NewReader(ftest)
	nSample := 0  // number of classified samples
	nCorrect := 0 // number of correct predictions
	lineNo := 0
	for {
		// ReadString may return data together with io.EOF when the last
		// line has no trailing newline, so the line is handled before the
		// EOF check.
		line, ioerr := br.ReadString('\n')
		if ioerr != nil && ioerr != io.EOF {
			fmt.Fprintf(os.Stderr, "read %s: %v\n", ftestName, ioerr)
			break
		}
		lineNo++
		if strings.TrimSpace(line) != "" && line[0] != '#' {
			if label, ok := parseSample(line); ok {
				_, objclass := naiveBayes(objAttribute, meanEach, stdEach, priorProbability, catalog)
				// Catalog labels may still carry the line terminator.
				objclass = strings.TrimSpace(objclass)
				nSample++
				fmt.Printf("the %d object predicion class is %s , the real object is %s \n", nSample, objclass, label)
				if objclass == label {
					nCorrect++
				}
			} else {
				fmt.Fprintf(os.Stderr, "%s:%d: skipping malformed line\n", ftestName, lineNo)
			}
		}
		if ioerr == io.EOF {
			break
		}
	}

	if nSample == 0 {
		// Avoid printing NaN from 0/0.
		fmt.Fprintf(os.Stderr, "no test samples found in %s\n", ftestName)
		return
	}
	fmt.Printf("The prediction accuracy is %f \n", float64(nCorrect)/float64(nSample))
}
// naiveBayes classifies one object with a Gaussian naive Bayes model.
//
// For every class k it evaluates
//
//	prior[k] * prod_i N(objAttribute[i]; meanEach[k][i], stdEach[k][i])
//
// and selects the class with the largest value. The evaluation is carried out
// in log space, so the decision stays meaningful even when the product of
// many small densities would underflow to zero in float64.
//
// Input parameters:
//
//	objAttribute     attributes of the object to classify
//	meanEach         per-class, per-attribute mean (nCatalog x lenAttribute)
//	stdEach          per-class, per-attribute standard deviation
//	priorProbability prior probability of each class
//	catalog          class labels, one per row of meanEach/stdEach
//
// Output parameters:
//
//	objFunc  value of the expression above for each class (exp of the log
//	         score; it may underflow to 0 although the ranking is still right)
//	objclass label of the winning class; "" when catalog is empty
//
// Ties, and the case where no class has a comparable score, resolve to the
// earliest class.
func naiveBayes(objAttribute []float64, meanEach [][]float64,
	stdEach [][]float64, priorProbability []float64,
	catalog []string) (objFunc []float64, objclass string) {
	// minStd only guards against division by zero: a zero, negative or NaN
	// standard deviation would otherwise turn the whole class score into NaN.
	const minStd = 1e-300

	nCatalog := len(catalog)
	objFunc = make([]float64, nCatalog)
	if nCatalog == 0 {
		return objFunc, ""
	}

	logSqrt2Pi := 0.5 * math.Log(2.0*math.Pi)
	best := 0
	bestScore := math.Inf(-1)
	for k := 0; k < nCatalog; k++ {
		score := math.Log(priorProbability[k])
		for i, x := range objAttribute {
			sigma := stdEach[k][i]
			if math.IsNaN(sigma) || sigma < minStd {
				sigma = minStd
			}
			z := (x - meanEach[k][i]) / sigma
			// log of the normal density: -z^2/2 - log(sigma) - log(sqrt(2*pi))
			score += -0.5*z*z - math.Log(sigma) - logSqrt2Pi
		}
		objFunc[k] = math.Exp(score)
		// A NaN score never compares greater, so it can never win.
		if score > bestScore {
			best, bestScore = k, score
		}
	}
	return objFunc, catalog[best]
}
// getMax returns the index of the largest element of v.
//
// When several elements share the largest value the earliest index is
// returned. v must hold at least one element: v[0] is read unconditionally.
//
// Created : Song Yezhi 2022-6-13 1:31
func getMax(v []float64) (imax int) {
	largest := v[0]
	for idx, val := range v {
		if val > largest {
			largest, imax = val, idx
		}
	}
	return imax
}
// naiveBayesTraining estimates the parameters of a Gaussian naive Bayes model
// from a comma separated training file.
//
// Each data line holds lenAttribute numeric fields followed by the class
// label. Blank lines and lines whose first byte is '#' are skipped. Labels
// are matched against catalog after trimming surrounding white space on both
// sides, so catalog entries may still carry their line terminator.
//
// Input parameters:
//
//	finName      training data file name
//	catalog      class labels
//	lenAttribute number of attributes, not counting the response variable
//
// Output parameters:
//
//	meanEach         per-class mean of every attribute (nCatalog x lenAttribute)
//	stdEach          per-class sample standard deviation (n-1 denominator)
//	priorProbability fraction of the training samples belonging to each class
//
// A class without samples keeps mean, deviation and prior 0; a class with a
// single sample keeps deviation 0 (instead of the NaN that 0/0 would give).
// If the file cannot be read the error is reported on stderr and an all-zero
// model is returned.
//
// Created : Song Yezhi 2022-6-12 21:16
// Email   : song.yz@foxmail.com
// Copyright (C) Chinese Academy of Sciences. All rights reserved, 2022.
func naiveBayesTraining(finName string, catalog []string, lenAttribute int) (meanEach [][]float64,
	stdEach [][]float64, priorProbability []float64) {
	nCatalog := len(catalog)
	priorProbability = make([]float64, nCatalog)
	meanEach = make([][]float64, nCatalog)
	stdEach = make([][]float64, nCatalog)
	for k := 0; k < nCatalog; k++ {
		meanEach[k] = make([]float64, lenAttribute)
		stdEach[k] = make([]float64, lenAttribute)
	}

	rows, classOf, err := readTrainingSamples(finName, catalog, lenAttribute)
	if err != nil {
		fmt.Fprintf(os.Stderr, "naiveBayesTraining: %v\n", err)
		return meanEach, stdEach, priorProbability
	}

	// Per-class sample counts and attribute sums.
	numberEachCatalog := make([]int, nCatalog)
	for r, row := range rows {
		k := classOf[r]
		numberEachCatalog[k]++
		for i, v := range row {
			meanEach[k][i] += v
		}
	}
	for k := 0; k < nCatalog; k++ {
		if numberEachCatalog[k] == 0 {
			continue
		}
		for i := range meanEach[k] {
			meanEach[k][i] /= float64(numberEachCatalog[k])
		}
	}

	// Sum of squared deviations, then the sample standard deviation.
	for r, row := range rows {
		k := classOf[r]
		for i, v := range row {
			d := v - meanEach[k][i]
			stdEach[k][i] += d * d
		}
	}
	for k := 0; k < nCatalog; k++ {
		for i := range stdEach[k] {
			if numberEachCatalog[k] > 1 {
				stdEach[k][i] = math.Sqrt(stdEach[k][i] / float64(numberEachCatalog[k]-1))
			} else {
				stdEach[k][i] = 0
			}
		}
	}

	// Prior probability of each class.
	if total := len(rows); total > 0 {
		for k := 0; k < nCatalog; k++ {
			priorProbability[k] = float64(numberEachCatalog[k]) / float64(total)
		}
	}
	return meanEach, stdEach, priorProbability
}

// readTrainingSamples parses the training file once and returns, for every
// usable data line, its attribute values and the catalog index of its label.
// Lines whose label is not in catalog are ignored; lines with too few fields
// or a non-numeric attribute are reported on stderr and skipped.
func readTrainingSamples(finName string, catalog []string, lenAttribute int) (rows [][]float64, classOf []int, err error) {
	fin, err := os.Open(finName)
	if err != nil {
		return nil, nil, err
	}
	defer fin.Close()

	labels := make([]string, len(catalog))
	for j, c := range catalog {
		labels[j] = strings.TrimSpace(c)
	}

	br := bufio.NewReader(fin)
	lineNo := 0
	for {
		// ReadString may return the last line together with io.EOF when the
		// file does not end in a newline, so data is handled before EOF.
		line, ioerr := br.ReadString('\n')
		if ioerr != nil && ioerr != io.EOF {
			return nil, nil, ioerr
		}
		lineNo++
		if strings.TrimSpace(line) != "" && line[0] != '#' {
			if row, k, ok := parseTrainingLine(line, labels, lenAttribute); ok {
				rows = append(rows, row)
				classOf = append(classOf, k)
			} else if k == -2 {
				fmt.Fprintf(os.Stderr, "%s:%d: skipping malformed line\n", finName, lineNo)
			}
		}
		if ioerr == io.EOF {
			break
		}
	}
	return rows, classOf, nil
}

// parseTrainingLine splits one data line. It returns ok=false with k=-1 when
// the label is unknown and with k=-2 when the line is malformed.
func parseTrainingLine(line string, labels []string, lenAttribute int) (row []float64, k int, ok bool) {
	word := strings.Split(line, ",")
	if len(word) <= lenAttribute {
		return nil, -2, false
	}
	label := strings.TrimSpace(word[lenAttribute])
	k = -1
	for j, l := range labels {
		if l == label {
			k = j
			break
		}
	}
	if k < 0 {
		return nil, -1, false
	}
	row = make([]float64, lenAttribute)
	for i := 0; i < lenAttribute; i++ {
		v, perr := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
		if perr != nil {
			return nil, -2, false
		}
		row[i] = v
	}
	return row, k, true
}
// GetCatalog scans a comma separated training file and collects the distinct
// class labels together with the number of attributes per line.
//
// Input parameters:
//
//	finName training data file name
//
// Output parameters:
//
//	catalog      distinct class labels in order of first appearance. A label
//	             is the raw last field of its line, so it normally still ends
//	             with the line terminator; the training code in this file
//	             compares labels against that raw field. Two labels that
//	             differ only in surrounding white space count as one class.
//	lenAttribute number of fields before the label, taken from the last data
//	             line that was read
//
// Blank lines and lines whose first byte is '#' are skipped. A final line
// without trailing newline is processed too. If the file cannot be opened or
// read, the error is reported on stderr and whatever was collected so far is
// returned (nil and 0 when the file cannot be opened).
//
// Created : Song Yezhi 2022-6-11 23:43
// Email   : song.yz@foxmail.com
// Copyright (C) Chinese Academy of Sciences. All rights reserved, 2022.
func GetCatalog(finName string) (catalog []string, lenAttribute int) {
	fin, err := os.Open(finName)
	if err != nil {
		// With a nil file every read fails with a non-EOF error, which would
		// keep the loop below spinning forever.
		fmt.Fprintf(os.Stderr, "GetCatalog: %v\n", err)
		return nil, 0
	}
	defer fin.Close()

	seen := make(map[string]bool)
	br := bufio.NewReader(fin)
	for {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil && ioerr != io.EOF {
			fmt.Fprintf(os.Stderr, "GetCatalog: read %s: %v\n", finName, ioerr)
			break
		}
		if strings.TrimSpace(line) != "" && line[0] != '#' {
			word := strings.Split(line, ",")
			lenAttribute = len(word) - 1 // without the response variable
			label := word[len(word)-1]
			if key := strings.TrimSpace(label); !seen[key] {
				seen[key] = true
				catalog = append(catalog, label)
			}
		}
		if ioerr == io.EOF {
			break
		}
	}
	return catalog, lenAttribute
}
// Vec returns a new float64 slice of length N with every element set to 0.
//
// Author  : Song Yezhi
// Version : 2020-5-24 17:50
func Vec(N int) (V []float64) {
	// make already hands back zero-valued elements.
	return make([]float64, N)
}
// Mat creates an M x N matrix as a slice of M independently allocated rows,
// each holding N float64 values set to 0.
//
// Author  : Song Yezhi
// Version : 2020-5-23 23:43
func Mat(M int, N int) (A [][]float64) {
	rows := make([][]float64, M)
	for r := range rows {
		// make already hands back zero-valued elements.
		rows[r] = make([]float64, N)
	}
	return rows
}
// MatOutput prints the matrix A to standard output, one row per line, every
// element formatted with "%18.6f " (width 18, six decimals, trailing space).
//
// Each row is printed with its own length, so an empty matrix prints nothing
// and rows of different lengths are printed in full instead of panicking.
//
// Author  : Song Yezhi
// Version : 2020-5-24 16:54
func MatOutput(A [][]float64) {
	for _, row := range A {
		for _, v := range row {
			fmt.Printf("%18.6f ", v)
		}
		fmt.Print("\n")
	}
}
// VecOutput prints every element of V to standard output on its own line,
// formatted with "%18.6f" (width 18, six decimals).
func VecOutput(V []float64) {
	for _, elem := range V {
		fmt.Printf("%18.6f\n", elem)
	}
}
// MatDim returns the dimensions of the matrix A: M is the number of rows and
// N is the length of the last row. An empty matrix yields (0, 0) instead of
// panicking on the missing row.
//
// Author  : Song Yezhi
// Version : 2020-5-24 12:45
func MatDim(A [][]float64) (M int, N int) {
	M = len(A)
	if M == 0 {
		return 0, 0
	}
	return M, len(A[M-1])
}