Statistical Machine Learning
Appendix: QDA Code
The source code is provided here for reference only; if you are interested in code for other algorithms, you are welcome to contact me. This appendix gives the source code for some of the algorithms, written in Go. The code has not been tested thoroughly, so it may contain bugs.
In addition, when testing on some public datasets, I found that the same algorithms reported in the foreign literature achieve higher recognition accuracy than my implementation, so the program may still have some problems. Please use it with caution.
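For reference, the QDA function in the listing below scores each class $k$ with the quadratic discriminant (cf. Eq. 4.28 of "An Introduction to Statistical Learning" and Chapter 4 of "The Elements of Statistical Learning") and predicts the class with the largest score:

$$
\delta_k(x) = -\tfrac{1}{2}\,x^{\top}\Sigma_k^{-1}x + x^{\top}\Sigma_k^{-1}\mu_k - \tfrac{1}{2}\,\mu_k^{\top}\Sigma_k^{-1}\mu_k - \tfrac{1}{2}\log\lvert\Sigma_k\rvert + \log\pi_k .
$$

Here $\mu_k$, $\Sigma_k$, and $\pi_k$ are the per-class mean, covariance, and prior probability estimated from the training data in QDAtraining, and $\log\lvert\Sigma_k\rvert$ is obtained from the $LDL^{\top}$ factorization of $\Sigma_k$ as $\sum_{l}\log D_{kl}$; these are exactly the five terms tmp1 through tmp5 accumulated in QDA.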
package main
import (
"bufio"
"fmt"
"io"
"math"
"os"
"strconv"
"strings"
)
func main() {
/*------------------------------------------------------
Created : Song Yezhi 2022-6-13 2:40
machine learning test
--------------------------------------------------------
Input Parameters :
Output Parameters :
--------------------------------------------------------
Email : song.yz@foxmail.com
Copyright (C) : Chinese Academy of Sciences
All rights reserved, 2022
-------------------------------------------------------*/
test()
}
func test() {
finName := "training.txt"
//training data
catalog, lenAttribute := GetCatalog(finName)
//training
fmt.Printf("lenAttribute = %d \n", lenAttribute)
var meanEach [][]float64
var covEach [][][]float64
var invCovEach [][][]float64
var priorProbability []float64
var D [][]float64
// D is the LDL of covariance
meanEach, covEach, invCovEach, priorProbability, D = QDAtraining(finName, catalog, lenAttribute)
debug := 0
if debug == 1 {
for k := 0; k < len(catalog); k++ {
fmt.Printf("the %d catatlog \n", k)
meanV := Vec(lenAttribute)
Dv := Vec(lenAttribute)
for i := 0; i < lenAttribute; i++ {
meanV[i] = meanEach[k][i]
Dv[i] = D[k][i]
}
fmt.Println("mean=")
VecOutput(meanV)
covMat := Mat(lenAttribute, lenAttribute)
invMat := Mat(lenAttribute, lenAttribute)
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
covMat[i][j] = covEach[k][i][j]
invMat[i][j] = invCovEach[k][i][j]
}
}
fmt.Println("cov=")
MatOutput(covMat)
fmt.Println("inv cov=")
MatOutput(invMat)
fmt.Println("D=")
VecOutput(Dv)
fmt.Println(" \n \n")
}
}
// begin test
ftestName := "test.txt"
ftest, _ := os.Open(ftestName)
defer ftest.Close()
br := bufio.NewReader(ftest)
objAttribute := Vec(lenAttribute)
i := 0 // number of test objects processed
k := 0 // number of correct predictions
for {
line, ioerr := br.ReadString('\n')
if ioerr == io.EOF {
break
}
// in case of no content
if strings.TrimSpace(line) == "" {
continue
}
//the comment
if line[0] == '#' {
continue
}
word := strings.Split(line, ",")
for i := 0; i < lenAttribute; i++ {
objAttribute[i], _ = strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
}
_, objclass := QDA(objAttribute, meanEach, invCovEach, priorProbability, D, catalog)
// the predicted class has been obtained; compare it with the true label
i = i + 1
fmt.Printf("the %d object prediction class is %s , the real class is %s \n", i, objclass, strings.TrimSpace(word[lenAttribute]))
if strings.TrimSpace(objclass) == strings.TrimSpace(word[lenAttribute]) {
k = k + 1
}
}
fmt.Printf("The prediction accuracy is %f \n", float64(k)/float64(i))
}
func QDA(objAttribute []float64, meanEach [][]float64,
invCovEach [][][]float64, priorProbability []float64, D [][]float64,
catalog []string) (objFunc []float64, objclass string) {
/* ------------------------------------------------------
Created : Song Yezhi 2022-6-13 0:44
QDA Classification
** important
log |sigma_k| = sum_l log D_kl
Ref : P113, "The Elements of Statistical Learning"
Ref : Eq. 4.28, "An Introduction to Statistical Learning"
--------------------------------------------------------
Input Parameters :
objAttribute[] ----- attribute of test object
mean ------------ each mean of training data attribute
invCov ---------- inv matrix of covariance of the training data
priorProbability ---- prior probability of each catalog
catalog ---------- catalog in string form
D ----- diagonal of the LDL' factorization of each catalog's covariance
Output Parameters :
objclass ---- the predicted class for objAttribute
--------------------------------------------------------
Email : song.yz@foxmail.com
Copyright (C) : Chinese Academy of Sciences
All rights reserved, 2022
------------------------------------------------------- */
nCatalog := len(catalog)
objFunc = Vec(nCatalog)
//objective function, Ref 4.24 "An Introduction to Statistical Learning, 2nd edition"
// Ref 4.10 "the elements of statistical learning"
lenAttribute := len(objAttribute)
miuk := Vec(lenAttribute)
sigmaMiu := Vec(lenAttribute)
sigmaX := Vec(lenAttribute)
var tmp1 float64
var tmp2 float64
var tmp3 float64
var tmp4 float64
var tmp5 float64
invCovTmp := Mat(lenAttribute, lenAttribute)
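// For each catalog k, the loop below accumulates the quadratic discriminant
// delta_k(x) as five terms:
//   tmp1 = -x' S_k^{-1} x / 2
//   tmp2 =  x' S_k^{-1} mu_k
//   tmp3 = -mu_k' S_k^{-1} mu_k / 2
//   tmp4 = -log|S_k| / 2   (from the LDL' diagonal D of the covariance)
//   tmp5 =  log(pi_k)
// and the object is assigned to the catalog with the largest score.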
for k := 0; k < nCatalog; k++ {
for j := 0; j < lenAttribute; j++ {
miuk[j] = meanEach[k][j]
}
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
invCovTmp[i][j] = invCovEach[k][i][j]
}
}
sigmaX = MatVec(invCovTmp, objAttribute)
tmp1 = VecDot(objAttribute, sigmaX)
tmp1 = -0.5 * tmp1
//-----------------
sigmaMiu = MatVec(invCovTmp, miuk)
tmp2 = VecDot(objAttribute, sigmaMiu)
//-----------------------
tmp3 = VecDot(miuk, sigmaMiu)
tmp3 = -0.5 * tmp3
//--------------------
//========important ***
//Ref p113 ,the elements of statistical learning
// second edition
tmp4 = 0.0
for i := 0; i < lenAttribute; i++ {
tmp4 = tmp4 + math.Log(D[k][i])
}
tmp4 = -0.5 * tmp4
//====================
tmp5 = math.Log(priorProbability[k])
if false { // debug output, disabled
fmt.Println("tmp1")
fmt.Println(tmp1)
fmt.Println("tmp2")
fmt.Println(tmp2)
fmt.Println("tmp3")
fmt.Println(tmp3)
fmt.Println("tmp4")
fmt.Println(tmp4)
fmt.Println("tmp5")
fmt.Println(tmp5)
}
objFunc[k] = tmp1 + tmp2 + tmp3 + tmp4 + tmp5
}
imax := getMax(objFunc)
objclass = catalog[imax]
//fmt.Println(objFunc)
return
}
func getMax(v []float64) (imax int) {
/*------------------------------------------------------
Created : Song Yezhi 2022-6-13 1:31
get the index of the largest element from a slice
--------------------------------------------------------
Input Parameters :
v ---- float slice
Output Parameters :
ind -- index of the largest element
-------------------------------------------------------*/
imax = 0
vmax := v[0]
N := len(v)
for i := 1; i < N; i++ {
if v[i] > vmax {
imax = i
vmax = v[i]
}
}
return
}
func QDAtraining(finName string, catalog []string, lenAttribute int) (meanEach [][]float64,
covEach [][][]float64, invCovEach [][][]float64,
priorProbability []float64, D [][]float64) {
/*------------------------------------------------------
Created : Song Yezhi 2022-6-12 21:16
QDA training : compute each catalog's mean and covariance
** important
log |sigma_k| = sum_l log D_kl
Ref : P113, "The Elements of Statistical Learning"
--------------------------------------------------------
Input Parameters :
finName --- training data file name
catalog ---- string slice of the catalog
lenAttribute ---- the number of attributes, excluding the response variable
Output Parameters :
meanEach ---- mean for each catalog
covEach ---- covariance of each catalog
invCovEach ---- inverse covariance of each catalog
priorProbability ---- prior probability of each catalog
D ----- diagonal of the LDL' factorization of each catalog's covariance
--------------------------------------------------------
Email : song.yz@foxmail.com
Copyright (C) : Chinese Academy of Sciences
All rights reserved, 2022
-------------------------------------------------------*/
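// The two passes over the training file below estimate, for each catalog k
// with N_k samples (N samples in total):
//   meanEach[k]         = (1/N_k) * sum of x over the catalog's samples
//   covEach[k]          = (1/(N_k-1)) * sum of (x - mean_k)(x - mean_k)'
//   priorProbability[k] = N_k / N
// together with the inverse covariance and the diagonal D of its LDL'
// factorization, from which QDA() computes log|sigma_k|.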
nCatalog := len(catalog)
//how many classes of catalog
fin, _ := os.Open(finName)
defer fin.Close()
br := bufio.NewReader(fin)
numberEachCatalog := make([]int, nCatalog)
//used to compute the prior probability of each catalog
priorProbability = Vec(nCatalog)
meanEach = Mat(nCatalog, lenAttribute)
//for each catalog 's mean
// it is a matrix of nCatalog * lenAttribute
D = Mat(nCatalog, lenAttribute)
//---------------------------------
// cov matrix for each catalog
covEach = make([][][]float64, nCatalog)
for i := 0; i < nCatalog; i++ {
covEach[i] = Mat(lenAttribute, lenAttribute)
}
//------------------------------------
//---------------------------------
// inverse cov matrix for each catalog
invCovEach = make([][][]float64, nCatalog)
for i := 0; i < nCatalog; i++ {
invCovEach[i] = Mat(lenAttribute, lenAttribute)
}
//------------------------------------
var k int
for {
line, ioerr := br.ReadString('\n')
if ioerr == io.EOF {
break
}
// in case of no content
if strings.TrimSpace(line) == "" {
continue
}
if line[0] == '#' {
continue
}
word := strings.Split(line, ",")
for i := 0; i < lenAttribute; i++ {
tmp, _ := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
// for each catalog compute the mean
// compute the sum first
for j := 0; j < nCatalog; j++ {
if strings.TrimSpace(word[lenAttribute]) == catalog[j] {
meanEach[j][i] = meanEach[j][i] + tmp
}
}
}
for j := 0; j < nCatalog; j++ {
if strings.TrimSpace(word[lenAttribute]) == catalog[j] {
numberEachCatalog[j] = numberEachCatalog[j] + 1
}
}
}
//compute each mean matrix ============================
for i := 0; i < nCatalog; i++ {
for j := 0; j < lenAttribute; j++ {
meanEach[i][j] = meanEach[i][j] / float64(numberEachCatalog[i])
}
}
//=========================================
_, _ = fin.Seek(0, io.SeekStart)
//set the file pointer back to the beginning of the file
br = bufio.NewReader(fin)
//reset the buffered reader so the second pass reads from the start
//----------------------
for {
line, ioerr := br.ReadString('\n')
if ioerr == io.EOF {
break
}
// in case of no content
if strings.TrimSpace(line) == "" {
continue
}
//the comment
if line[0] == '#' {
continue
}
word := strings.Split(line, ",")
// temporary vector holding the attributes of this sample
wordFloat := Vec(lenAttribute)
for i := 0; i < lenAttribute; i++ {
wordFloat[i], _ = strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
}
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
for k := 0; k < nCatalog; k++ {
if strings.TrimSpace(word[lenAttribute]) == catalog[k] {
covEach[k][i][j] = covEach[k][i][j] + (wordFloat[i]-meanEach[k][i])*(wordFloat[j]-meanEach[k][j])
}
}
}
}
}
//compute the each covariance of the training data============
for k = 0; k < nCatalog; k++ {
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
covEach[k][i][j] = covEach[k][i][j] / float64(numberEachCatalog[k]-1)
}
}
}
// compute the inverse cov matrix for each catalog
for k = 0; k < nCatalog; k++ {
matTMP1 := Mat(lenAttribute, lenAttribute)
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
matTMP1[i][j] = covEach[k][i][j]
}
}
matTMP2 := invMat(matTMP1)
for i := 0; i < lenAttribute; i++ {
for j := 0; j < lenAttribute; j++ {
invCovEach[k][i][j] = matTMP2[i][j]
}
}
_, Dtmp := LDL(matTMP1)
for i := 0; i < lenAttribute; i++ {
D[k][i] = Dtmp[i]
}
}
Nlength := 0
for i := 0; i < nCatalog; i++ {
Nlength = Nlength + numberEachCatalog[i]
}
//compute the Prior probability of each catalog ====================
// Ref -- pi in the <>
for i := 0; i < nCatalog; i++ {
priorProbability[i] = float64(numberEachCatalog[i]) / float64(Nlength)
}
return
}
func GetCatalog(finName string) (catalog []string, lenAttribute int) {
/*------------------------------------------------------
Created : Song Yezhi 2022-6-11 23:43
get the classes of the training data
--------------------------------------------------------
Input Parameters :
finName---- training data file name
Output Parameters :
catalog ---- string slice, the classes in the data
lenAttribute ---- int, the number of attributes, excluding the response variable
--------------------------------------------------------
Email : song.yz@foxmail.com
Copyright (C) : Chinese Academy of Sciences
All rights reserved, 2022
-------------------------------------------------------*/
fin, _ := os.Open(finName)
br := bufio.NewReader(fin)
cataExist := 0
for {
line, ioerr := br.ReadString('\n')
if ioerr == io.EOF {
break
}
// in case of no content
if strings.TrimSpace(line) == "" {
continue
}
word := strings.Split(line, ",")
lenAttribute = len(word) - 1
//without response var
cataLog := strings.TrimSpace(word[len(word)-1]) // class label, trimmed of the trailing newline
cataExist = 0
for i := 0; i < len(catalog); i++ {
if cataLog == catalog[i] {
cataExist = 1
}
}
if cataExist == 0 {
catalog = append(catalog, cataLog)
}
}
fin.Close()
return
}
func Vec(N int) (V []float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-24 17:50
!
!
! -----------------------------------------------------
! Input Parameters :
!
! Output Parameters :
!
------------------------------------------------------*/
V = make([]float64, N)
for i := 0; i < N; i++ {
V[i] = 0.0
}
return
}
func Mat(M int, N int) (A [][]float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-23 23:43
!
! create a two-dimensional slice (matrix)
! -----------------------------------------------------
! Input Parameters :
!
! Output Parameters :
!
------------------------------------------------------*/
A = make([][]float64, M)
for i := 0; i < M; i++ {
A[i] = make([]float64, N)
}
for i := 0; i < M; i++ {
for j := 0; j < N; j++ {
A[i][j] = 0.0
}
}
return
}
func MatOutput(A [][]float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-24 16:54
!
------------------------------------------------------*/
M, N := MatDim(A)
for i := 0; i < M; i++ {
for j := 0; j < N; j++ {
fmt.Printf("%18.6f ", A[i][j])
}
fmt.Printf("\n")
}
}
func VecOutput(V []float64) {
N := len(V)
for i := 0; i < N; i++ {
fmt.Printf("%18.6f\n", V[i])
}
}
func MatDim(A [][]float64) (M int, N int) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-24 12:45
------------------------------------------------------*/
M = len(A)
N = len(A[M-1])
return
}
func invlowtri(L [][]float64) (R [][]float64) {
/*------------------------------------------ comment
Author : Song Yezhi 2022-06-12 23:59:13
inverse of lower triangular matrix
---------------------------------------------------
Description : inverse of lower triangular matrix
*
a11 b11
a21 a22 b21 b22
a31 a32 a33 b31 b32 b33
a41 a42 a43 a44 b41 b42 b43 b44
a51 a52 a53 a54 a55 b51 b52 b53 b54 b55
c11
c21 c22
c31 c32 c33
c41 c42 c43 c44
c51 c52 c53 c54 c55
0 + a(4,2)*b(2,2) + a(4,3)*b(3,2) + a(4,4)*b(4,2) + 0 = 0
=>  a(4,4)*b(4,2) = - sum_k a(4,k)*b(k,2),  k = 2,3
Input Para :
L --- lower triangular matrix
Output Para :
R --- inverse of L (also a lower triangular matrix)
Ref:
1.
Chapter5,p228
2.
P58.
-------------------------------------------------*/
M, _ := MatDim(L)
var i, j, k int
var sum float64
R = Mat(M, M)
for i = 0; i < M; i++ {
R[i][i] = 1.0 / L[i][i]
}
for i = 0; i < M; i++ {
for j = i + 1; j < M; j++ {
sum = 0.0
for k = i; k < j; k++ {
sum = sum + L[j][k]*R[k][i]
}
// forward substitution: entry (j, i) of inv(L)
R[j][i] = -sum * R[j][j]
}
}
return
}
func invuptri(U [][]float64) (R [][]float64) {
/*------------------------------------------ comment
Author : song yezhi
Date : 2022-06-13
---------------------------------------------------
Description : inverse of upper triangular matrix
*
Input Para :
U --- upper triangular matrix
Output Para :
R --- inverse of U (also an upper triangular matrix)
-------------------------------------------------*/
// invert U by transposing to lower triangular form, inverting, and transposing back
UT := Transepose(U)
R1 := invlowtri(UT)
R = Transepose(R1)
return
}
func invMat(A [][]float64) (invA [][]float64) {
/*------------------------------------------ comment
Author : song yezhi
Date : 2022.06.13
inverse of symmetric positive definite matrix
---------------------------------------------------
Description : inverse of a symmetric positive definite matrix via its LDL' factorization
*
A = L D L'
inv(A) = inv(L') inv(D) inv(L)
       = (inv(L))' inv(D) inv(L)
Input Para :
A ---- symmetric positive definite matrix
Output Para :
invA --- inverse of A
Ref:
1.
Chapter5,p228
-------------------------------------------------*/
N, _ := MatDim(A)
L, D := LDL(A)
// R = inv(L); iU = (inv(L))'
R := invlowtri(L)
iU := Transepose(R)
// dividing column j of iU by D[j] gives (inv(L))' inv(D)
for i := 0; i < N; i++ {
for j := 0; j < N; j++ {
iU[i][j] = iU[i][j] / D[j]
}
}
invA = MatMul(iU, R)
return
}
func Transepose(A [][]float64) (AT [][]float64) {
/*------------------------------------------------------
Created : Song Yezhi 2022-6-13 0:08
--------------------------------------------------------
Input Parameters :
Output Parameters :
-------------------------------------------------------*/
M, N := MatDim(A)
AT = Mat(N, M)
for i := 0; i < N; i++ {
for j := 0; j < M; j++ {
AT[i][j] = A[j][i]
}
}
return
}
func LDL(A [][]float64) (L [][]float64, D []float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-24 17:24
!
! A=LDL'
! -----------------------------------------------------
! Input Parameters :
! A
! Output Parameters :
! L
! D
------------------------------------------------------*/
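// Recurrences implemented below (row by row, i = 1 .. N-1):
//   G[i][j] = A[i][j] - sum_{k<j} G[i][k]*L[j][k]   for j < i
//   L[i][j] = G[i][j] / D[j]                        for j < i
//   D[i]    = A[i][i] - sum_{k<i} G[i][k]*L[i][k]
// with D[0] = A[0][0] and a unit diagonal L[i][i] = 1.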
M, N := MatDim(A)
if M != N {
fmt.Printf("the rows and cols is not equal")
}
G := Mat(N, N)
L = Mat(N, N)
D = Vec(N)
var i, j, k int
D[0] = A[0][0]
var tmp1 float64
for i = 1; i < N; i++ {
for j = 0; j <= i-1; j++ {
tmp1 = 0.0
for k = 0; k <= j-1; k++ {
tmp1 = tmp1 + G[i][k]*L[j][k]
}
G[i][j] = A[i][j] - tmp1
}
for j = 0; j <= i-1; j++ {
L[i][j] = G[i][j] / D[j]
}
//compute D[i]
tmp1 = 0.0
for k = 0; k <= i-1; k++ {
tmp1 = tmp1 + G[i][k]*L[i][k]
}
D[i] = A[i][i] - tmp1
}
//matOutput(L)
//set the diag elements
for i = 0; i < N; i++ {
L[i][i] = 1.0
}
return
}
func MatMul(A [][]float64, B [][]float64) (C [][]float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-23 23:27
! C = A*B
!
! -----------------------------------------------------
! Input Parameters :
! A[M][N]
! B[N][P]
! Output Parameters :
! C[M][P]
------------------------------------------------------*/
M := len(A)
N := len(A[M-1])
P := len(B[N-1])
var i, j, k int
C = Mat(M, P)
var sum float64
for i = 0; i < M; i++ {
for j = 0; j < P; j++ {
sum = 0.0
for k = 0; k < N; k++ {
sum = sum + A[i][k]*B[k][j]
}
C[i][j] = sum
}
}
return
}
func VecDot(x1 []float64, x2 []float64) (dot float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-19 22:53
!
! dot product
------------------------------------------------------*/
N := len(x1)
var i int
dot = 0.0
for i = 0; i < N; i++ {
dot = dot + x1[i]*x2[i]
}
return
}
func MatVec(A [][]float64, x []float64) (b []float64) {
/*------------------------------------------------------
! Author : Song Yezhi
! version : 2020-5-19 22:25
! b = Ax
!
! -----------------------------------------------------
! Input Parameters :
! A[M][N] ----MATRIX
! Output Parameters :
! b[M]-----vector
------------------------------------------------------*/
M := len(A)
N := len(A[0])
//the size of mat
//b = make([]float64, M)
b = Vec(M)
var sum float64
for i := 0; i < M; i++ {
sum = 0.0
for j := 0; j < N; j++ {
sum = sum + A[i][j]*x[j]
}
b[i] = sum
}
return
}
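The program expects two comma-separated text files in the working directory, training.txt and test.txt: each line holds the attribute values followed by the class label in the last field; blank lines are skipped and lines beginning with # are treated as comments. As a rough sketch (the column names and values here are hypothetical, Iris-style data), a training file might look like:

# sepal_length, sepal_width, petal_length, petal_width, class
5.1, 3.5, 1.4, 0.2, Iris-setosa
7.0, 3.2, 4.7, 1.4, Iris-versicolor
6.3, 3.3, 6.0, 2.5, Iris-virginica

test.txt uses the same layout; the program prints the predicted and true class of each test line and the overall prediction accuracy.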