song.yz@foxmail.com wechat: math-box

统计机器学习

附录:QDA代码



这里给出源代码,仅供参考。 如对其他代码感兴趣,欢迎与我联系。 这里给出部分算法的源代码。代码由go语言编写。这些代码没有经过详细测试,所以可能有bug。

另外,对一些公开的数据测试中,我发现国外一些文献中同样的算法机器学习性能后比我的识别准确率要高。 因此,程序可能还有一些问题。请谨慎使用。

package main

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"
	"strings"
)

func main() {
	/*------------------------------------------------------
	     Created  :  Song Yezhi   2022-6-13 2:40
	          machine learning  test
	  --------------------------------------------------------
	     Input Parameters   :

	     Output Parameters  :

	  --------------------------------------------------------
	     Email        : song.yz@foxmail.com
	     Copyrigt (C) : Chinese Academy of Sciences
	                    All rights reserved,  2022
	  -------------------------------------------------------*/
	test()

}

func test() {

	finName := "training.txt"
	//tranning data

	catalog, lenAttribute := GetCatalog(finName)
	//tranning
	fmt.Println("lenAttribute = %d", lenAttribute)

	var meanEach [][]float64

	var covEach [][][]float64

	var invCovEach [][][]float64
	var priorProbability []float64

	var D [][]float64
	// D is the LDL of covariance

	meanEach, covEach, invCovEach, priorProbability, D = QDAtraining(finName, catalog, lenAttribute)

	debug := 0

	if debug == 1 {

		for k := 0; k < len(catalog); k++ {

			fmt.Printf("the %d catatlog \n", k)

			meanV := Vec(lenAttribute)
			Dv := Vec(lenAttribute)

			for i := 0; i < lenAttribute; i++ {
				meanV[i] = meanEach[k][i]
				Dv[i] = D[k][i]
			}

			fmt.Println("mean=")
			VecOutput(meanV)

			covMat := Mat(lenAttribute, lenAttribute)
			invMat := Mat(lenAttribute, lenAttribute)

			for i := 0; i < lenAttribute; i++ {
				for j := 0; j < lenAttribute; j++ {

					covMat[i][j] = covEach[k][i][j]
					invMat[i][j] = invCovEach[k][i][j]
				}
			}

			fmt.Println("cov=")
			MatOutput(covMat)

			fmt.Println("inv cov=")
			MatOutput(invMat)

			fmt.Println("D=")
			VecOutput(Dv)

			fmt.Println(" \n \n")

		}

	}

	// begin test
	ftestName := "test.txt"
	ftest, _ := os.Open(ftestName)
	defer ftest.Close()

	br := bufio.NewReader(ftest)

	objAttribute := Vec(lenAttribute)

	i := 0
	k := 0
	for {

		line, ioerr := br.ReadString('\n')
		if ioerr == io.EOF {
			break
		}

		// in case of no content
		if strings.TrimSpace(line) == "" {
			continue
		}

		//the comment
		if line[0] == '#' {
			continue
		}

		word := strings.Split(line, ",")

		for i := 0; i < lenAttribute; i++ {
			objAttribute[i], _ = strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
		}

		_, objclass := QDA(objAttribute, meanEach, invCovEach, priorProbability, D, catalog)
		// already get the right class

		i = i + 1
		fmt.Printf("the %d object predicion class is %s , the real object is %s \n", i, objclass, word[lenAttribute])

		if strings.TrimSpace(objclass) == strings.TrimSpace(word[lenAttribute]) {
			k = k + 1
		}

	}

	fmt.Printf("The prediction accuracy is %f \n", float64(k)/float64(i))
}

func QDA(objAttribute []float64, meanEach [][]float64,
	invCovEach [][][]float64, priorProbability []float64, D [][]float64,
	catalog []string) (objFunc []float64, objclass string) {
	/* ------------------------------------------------------
	      Created  :  Song Yezhi   2022-6-13 0:44
	        QDA Classification

	        	** important
	          log |sigma_k| = simma log D_kl
	          Ref :  P113 "the elments of statistical learning.


	          Ref: an introduction of statistical learning  Eq.4.28
	   --------------------------------------------------------
	      Input Parameters   :
	           objAttribute[]  -----  attribute of test object
	           mean ------------  each mean of training data attribute
	           invCov ---------- inv matrix of covariance of the training data
	           priorProbalility ---- prior Probability of each catalog
	           catalog  ----------  catalog in string form
	           D    -----   LDL of each catalog's covraiance
	      Output Parameters  :
	           objclass  ---- the result of whic class  respect to objAttribute
	   --------------------------------------------------------
	      Email        : song.yz@foxmail.com
	      Copyrigt (C) : Chinese Academy of Sciences
	                     All rights reserved,  2022
	   ------------------------------------------------------- */
	nCatalog := len(catalog)

	objFunc = Vec(nCatalog)
	//object function Ref 4.24 "an introduction to statistiacl learning 2nd edtion"
	//   Ref  4.10 "the elements of statistical learning"

	lenAttribute := len(objAttribute)

	miuk := Vec(lenAttribute)
	sigmaMiu := Vec(lenAttribute)
	sigmaX := Vec(lenAttribute)

	var tmp1 float64
	var tmp2 float64
	var tmp3 float64
	var tmp4 float64
	var tmp5 float64

	invCovTmp := Mat(lenAttribute, lenAttribute)

	for k := 0; k < nCatalog; k++ {

		for j := 0; j < lenAttribute; j++ {
			miuk[j] = meanEach[k][j]
		}

		for i := 0; i < lenAttribute; i++ {
			for j := 0; j < lenAttribute; j++ {

				invCovTmp[i][j] = invCovEach[k][i][j]
			}
		}

		sigmaX = MatVec(invCovTmp, objAttribute)

		tmp1 = VecDot(objAttribute, sigmaX)

		tmp1 = -0.5 * tmp1

		//-----------------

		sigmaMiu = MatVec(invCovTmp, miuk)

		tmp2 = VecDot(objAttribute, sigmaMiu)

		//-----------------------

		tmp3 = VecDot(miuk, sigmaMiu)
		tmp3 = -0.5 * tmp3
		//--------------------

		//========important ***
		//Ref p113 ,the elements of statistical learning
		// second edition
		tmp4 = 0.0

		for i := 0; i < lenAttribute; i++ {

			tmp4 = tmp4 + math.Log(D[k][i])

		}
		
		tmp4 = -0.5 * tmp4

		//====================
		tmp5 = math.Log(priorProbability[k])
		
		 if 1==2 {
		
		fmt.Println("tmp1")
		fmt.Println(tmp1)
		fmt.Println("tmp2")
		fmt.Println(tmp2)
		fmt.Println("tmp3")
		fmt.Println(tmp3)
		fmt.Println("tmp4")
		fmt.Println(tmp4)
		fmt.Println("tmp5")
		fmt.Println(tmp5)
	  }

		objFunc[k] = tmp1 + tmp2 + tmp3 + tmp4 + tmp5

	}

	imax := getMax(objFunc)

	objclass = catalog[imax]

	//fmt.Println(objFunc)
	return
}

func getMax(v []float64) (imax int) {
	/*------------------------------------------------------
	     Created  :  Song Yezhi   2022-6-13 1:31

	            get the index of the largest element   from a slice
	  --------------------------------------------------------
	     Input Parameters   :
	            v ---- float slice
	     Output Parameters  :
	            ind -- index of the largest element
	  -------------------------------------------------------*/
	imax = 0

	vmax := v[0]

	N := len(v)

	for i := 1; i < N; i++ {

		if v[i] > vmax {

			imax = i

			vmax = v[i]

		}

	}

	return
}

func QDAtraining(finName string, catalog []string, lenAttribute int) (meanEach [][]float64,
	covEach [][][]float64, invCovEach [][][]float64,
	priorProbability []float64, D [][]float64) {
	/*------------------------------------------------------
	     Created  :  Song Yezhi   2022-6-12 21:16
	          data training by LDA  get the mean and cov

	       ** important

	          log |sigma_k| = simma log D_kl
	         Ref:  P113 "the elments of statistical learning.
	  --------------------------------------------------------
	     Input Parameters   :
	          finName --- training data file name
	          catalog ---- string slice of the catalog
	          lenAttribute ---- the number of attribute without response varaiable

	          meanEach ----- each catalog's mean value

	     Output Parameters  :
	          meanEach ---- mean for each catalog
	          covEach  ---- covariance of each catalog
	          invCovEach ----
	          priorProbability ---- prior Probability of each catalog
	          D    -----   LDL of each catalog's covraiance
	  --------------------------------------------------------
	     Email        : song.yz@foxmail.com
	     Copyrigt (C) : Chinese Academy of Sciences
	                    All rights reserved,  2022
	  -------------------------------------------------------*/

	nCatalog := len(catalog)
	//how many classes of catalog

	fin, _ := os.Open(finName)
	defer fin.Close()

	br := bufio.NewReader(fin)

	numberEachCatalog := make([]int, nCatalog)
	//for compute PI

	priorProbability = Vec(nCatalog)

	meanEach = Mat(nCatalog, lenAttribute)
	//for each catalog 's mean
	// it is a matrix of  nCatalog * lenAttribute

	D = Mat(nCatalog, lenAttribute)

	//---------------------------------
	// cov matrix for each catalog
	covEach = make([][][]float64, nCatalog)

	for i := 0; i < nCatalog; i++ {
		covEach[i] = Mat(lenAttribute, lenAttribute)
	}
	//------------------------------------

	//---------------------------------
	// invers cov matrix for each catalog
	invCovEach = make([][][]float64, nCatalog)

	for i := 0; i < nCatalog; i++ {
		invCovEach[i] = Mat(lenAttribute, lenAttribute)
	}
	//------------------------------------

	var k int

	for {

		line, ioerr := br.ReadString('\n')
		if ioerr == io.EOF {
			break
		}

		// in case of no content
		if strings.TrimSpace(line) == "" {
			continue
		}

		if line[0] == '#' {
			continue
		}

		word := strings.Split(line, ",")

		for i := 0; i < lenAttribute; i++ {

			tmp, _ := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)

			// for each catalog compute the mean
			// comput the sum first
			for j := 0; j < nCatalog; j++ {

				if word[lenAttribute] == catalog[j] {

					meanEach[j][i] = meanEach[j][i] + tmp

				}

			}

		}

		for j := 0; j < nCatalog; j++ {
			if word[lenAttribute] == catalog[j] {
				numberEachCatalog[j] = numberEachCatalog[j] + 1
			}

		}

	}

	//compute each mean matrix ============================
	for i := 0; i < nCatalog; i++ {

		for j := 0; j < lenAttribute; j++ {
			meanEach[i][j] = meanEach[i][j] / float64(numberEachCatalog[i])
		}
	}

	//=========================================

	_, _ = fin.Seek(0, io.SeekStart)
	//set the file point to the beginning of the file
	//----------------------

	for {

		line, ioerr := br.ReadString('\n')
		if ioerr == io.EOF {
			break
		}

		// in case of no content
		if strings.TrimSpace(line) == "" {
			continue
		}

		word := strings.Split(line, ",")

		var wordFloat []float64

		wordFloat = Vec(lenAttribute)
		// temp vector for the

		for i := 0; i < lenAttribute; i++ {
			wordFloat[i], _ = strconv.ParseFloat(word[i], 64)
		}

		for i := 0; i < lenAttribute; i++ {
			for j := 0; j < lenAttribute; j++ {

				for k := 0; k < nCatalog; k++ {
					if word[lenAttribute] == catalog[k] {

						covEach[k][i][j] = covEach[k][i][j] + (wordFloat[i]-meanEach[k][i])*(wordFloat[j]-meanEach[k][j])

					}
				}
			}
		}

	}

	//compute the each covariance of the training data============

	for k = 0; k < nCatalog; k++ {

		for i := 0; i < lenAttribute; i++ {
			for j := 0; j < lenAttribute; j++ {

				covEach[k][i][j] = covEach[k][i][j] / float64(numberEachCatalog[k]-1)

			}
		}
	}

	// compute invers cov matrix for each catalog

	for k = 0; k < nCatalog; k++ {

		matTMP1 := Mat(lenAttribute, lenAttribute)

		for i := 0; i < lenAttribute; i++ {

			for j := 0; j < lenAttribute; j++ {

				matTMP1[i][j] = covEach[k][i][j]
			}
		}

		matTMP2 := Mat(lenAttribute, lenAttribute)

		matTMP2 = invMat(matTMP1)

		for i := 0; i < lenAttribute; i++ {

			for j := 0; j < lenAttribute; j++ {

				invCovEach[k][i][j] = matTMP2[i][j]
			}
		}

		Dtmp := Vec(lenAttribute)

		_, Dtmp = LDL(matTMP1)

		for i := 0; i < lenAttribute; i++ {

			D[k][i] = Dtmp[i]

		}

	}

	Nlength := 0

	for i := 0; i < nCatalog; i++ {
		Nlength = Nlength + numberEachCatalog[i]
	}

	//compute the Prior probability of each catalog ====================
	// Ref -- pi  in the   <>
	for i := 0; i < nCatalog; i++ {
		priorProbability[i] = float64(numberEachCatalog[i]) / float64(Nlength)
	}

	return
}

func GetCatalog(finName string) (catalog []string, lenAttribute int) {
	/*------------------------------------------------------
	     Created  :  Song Yezhi   2022-6-11 23:43
	            get the classies of the training data
	  --------------------------------------------------------
	     Input Parameters   :
	            finName---- training data file name
	     Output Parameters  :
	            catalog---- string, classies of the data
	            lenAttribute ---- int ,how many attribute of the data without response varaiable
	  --------------------------------------------------------
	     Email        : song.yz@foxmail.com
	     Copyrigt (C) : Chinese Academy of Sciences
	                    All rights reserved,  2022
	  -------------------------------------------------------*/

	fin, _ := os.Open(finName)

	br := bufio.NewReader(fin)

	cataExist := 0

	for {
		line, ioerr := br.ReadString('\n')
		if ioerr == io.EOF {
			break
		}

		// in case of no content
		if strings.TrimSpace(line) == "" {
			continue
		}

		word := strings.Split(line, ",")

		lenAttribute = len(word) - 1
		//without response var

		cataLog := word[len(word)-1]

		cataExist = 0
		for i := 0; i < len(catalog); i++ {
			if cataLog == catalog[i] {
				cataExist = 1
			}
		}

		if cataExist == 0 {
			catalog = append(catalog, cataLog)
		}

	}

	fin.Close()

	return

}

func Vec(N int) (V []float64) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-24 17:50
	!
	!
	!  -----------------------------------------------------
	!  Input  Parameters :
	!
	!  Output Parameters :
	!
	------------------------------------------------------*/
	V = make([]float64, N)
	for i := 0; i < N; i++ {
		V[i] = 0.0
	}
	return
}

func Mat(M int, N int) (A [][]float64) {
	/*------------------------------------------------------
	  !  Author  : Song Yezhi
	  !  verison : 2020-5-23 23:43
	  !
	  !  creat a two dimention slices (matrix)
	  !  -----------------------------------------------------
	  !  Input  Parameters :
	  !
	  !  Output Parameters :
	  !
	  ------------------------------------------------------*/
	A = make([][]float64, M)
	for i := 0; i < M; i++ {
		A[i] = make([]float64, N)
	}

	for i := 0; i < M; i++ {
		for j := 0; j < N; j++ {
			A[i][j] = 0.0
		}
	}

	return
}

func MatOutput(A [][]float64) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-24 16:54
	!
	------------------------------------------------------*/
	M, N := MatDim(A)
	for i := 0; i < M; i++ {
		for j := 0; j < N; j++ {
			fmt.Printf("%18.6f   ", A[i][j])
		}
		fmt.Printf("\n")
	}
}

func VecOutput(V []float64) {
	N := len(V)
	for i := 0; i < N; i++ {
		fmt.Printf("%18.6f\n", V[i])
	}
}

func MatDim(A [][]float64) (M int, N int) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-24 12:45
	------------------------------------------------------*/
	M = len(A)
	N = len(A[M-1])
	return
}

func invlowtri(L [][]float64) (R [][]float64) {
	/*------------------------------------------ comment
	Author      :   Song Yezhi   2022-06-12 23:59:13
	           inverse of lower triangula matrix
	---------------------------------------------------
	Desciption  :  inverse of lower triangula matrix
	 *
	     a11                   b11
	     a21 a22               b21 b22
	     a31 a32 a33           b31 b32 b33
	     a41 a42 a43 a44       b41 b42 b43 b44
	     a51 a52 a53 a54 a55   b51 b52 b53 b54 b55
	     c11
	     c21 c22
	     c31 c32 c33
	     c41 c42 c43 c44
	     c51 c52 c53 c54 c55
	     
	     0+a(4,2)*b(2,2)+a(4,3)*b(3,2)+a(4,4)*b(4,2)+0=0
	       a(4,4)*b(4,2)= a(4,k)*b(k,2)  k=2,4

	Input Para  :
	      L--- lower triangula matrix
	Output Para :
	      R---inverse of L(also lower triangula matrix)

	Ref:
	       1. 
	           Chapter5,p228
	       2. 
	           P58.
	-------------------------------------------------*/

	M, _ := MatDim(L)

	var i, j, k int
	var sum float64

	R = Mat(M, M)

	for i = 0; i < M; i++ {

		R[i][i] = 1.0 / L[i][i]

	}

	for i = 0; i < M; i++ {

		for j = i + 1; j < M; j++ {

			sum = 0.0
			for k = i; k < j; k++ {

				sum = sum + L[j][k]*R[k][i]
				R[j][i] = -sum * R[j][j]

			}

		}

	}

	return
}

func invuptri(U [][]float64) (R [][]float64) {
	/*------------------------------------------ comment
	Author      : song yezhi
	Date        : 2022-06-13
	---------------------------------------------------
	Desciption  :  inverse of lower triangula matrix
	 *
	Input Para  :
	      U--- uptri matrix
	Output Para :
	      R---inverse of U(also upper triangula matrix)
	-------------------------------------------------*/

	N, _ := MatDim(U)

	UT := Mat(N, N)

	UT = Transepose(U)

	var R1 [][]float64

	R1 = invlowtri(UT)

	R = Transepose(R1)

	return
}

func invMat(A [][]float64) (invA [][]float64) {
	/*------------------------------------------ comment
	Author      : song yezhi
	Date        : 2022.06.13
	   inverse of   symmetric positive definite matrix
	---------------------------------------------------
	Desciption  :  inv of uptri matrix
	 *
	            A=LDL'
	            invA=
	                =inv(L)inv(D)inv(L')
	                = (inv(L'))' inv(D) inv(L')
	Input Para  :
	      A ---- ymmetric positive definite matrix
	Output Para :
	      invA---inv of A

	Ref:
	       1. 
	           Chapter5,p228
	-------------------------------------------------*/

	N, _ := MatDim(A)

	var L [][]float64
	var D []float64

	L, D = LDL(A)

	var R [][]float64

	R = invlowtri(L)

	var iU [][]float64

	iU = Transepose(R)

	for i := 0; i < N; i++ {
		for j := 0; j < N; j++ {
			iU[i][j] = iU[i][j] / D[j]
		}
	}

	invA = MatMul(iU, R)

	return
}

func Transepose(A [][]float64) (AT [][]float64) {
	/*------------------------------------------------------
	     Created  :  Song Yezhi   2022-6-13 0:08

	  --------------------------------------------------------
	     Input Parameters   :

	     Output Parameters  :
	  -------------------------------------------------------*/
	M, N := MatDim(A)

	AT = Mat(N, M)

	for i := 0; i < N; i++ {
		for j := 0; j < M; j++ {
			AT[i][j] = A[j][i]
		}
	}
	return
}

func LDL(A [][]float64) (L [][]float64, D []float64) {
	/*------------------------------------------------------
	  !  Author  : Song Yezhi
	  !  verison : 2020-5-24 17:24
	  !
	  !    A=LDL'
	  !  -----------------------------------------------------
	  !  Input  Parameters :
	  !    A
	  !  Output Parameters :
	  !    L
	  !    D
	  ------------------------------------------------------*/
	M, N := MatDim(A)
	if M != N {
		fmt.Printf("the rows and cols is not equal")
	}

	G := Mat(N, N)

	L = Mat(N, N)
	D = Vec(N)

	var i, j, k int

	D[0] = A[0][0]

	var tmp1 float64

	for i = 1; i < N; i++ {
		for j = 0; j <= i-1; j++ {
			tmp1 = 0.0
			for k = 0; k <= j-1; k++ {
				tmp1 = tmp1 + G[i][k]*L[j][k]
			}
			G[i][j] = A[i][j] - tmp1
		}

		for j = 0; j <= i-1; j++ {
			L[i][j] = G[i][j] / D[j]
		}

		//comput D[i]
		tmp1 = 0.0

		for k = 0; k <= i-1; k++ {
			tmp1 = tmp1 + G[i][k]*L[i][k]
		}

		D[i] = A[i][i] - tmp1

	}

	//matOutput(L)

	//set the diag elements
	for i = 0; i < N; i++ {
		L[i][i] = 1.0
	}
	return
}

func MatMul(A [][]float64, B [][]float64) (C [][]float64) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-23 23:27
	!    C = A*B
	!
	!  -----------------------------------------------------
	!  Input  Parameters :
	!     A[M][N]
	!     B[N][P]
	!  Output Parameters :
	!     C[M][P]
	------------------------------------------------------*/
	M := len(A)
	N := len(A[M-1])
	P := len(B[N-1])

	var i, j, k int

	C = Mat(M, P)
	var sum float64

	for i = 0; i < M; i++ {
		for j = 0; j < P; j++ {
			sum = 0.0
			for k = 0; k < N; k++ {
				sum = sum + A[i][k]*B[k][j]
				C[i][j] = sum
			}
		}
	}

	return
}

func VecDot(x1 []float64, x2 []float64) (dot float64) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-19 22:53
	!
	!     dot product
	------------------------------------------------------*/
	N := len(x1)
	var i int
	dot = 0.0
	for i = 0; i < N; i++ {
		dot = dot + x1[i]*x2[i]
	}
	return
}

func MatVec(A [][]float64, x []float64) (b []float64) {
	/*------------------------------------------------------
	!  Author  : Song Yezhi
	!  verison : 2020-5-19 22:25
	!      b = Ax
	!
	!  -----------------------------------------------------
	!  Input  Parameters :
	!     A[M][N] ----MATRIX
	!  Output Parameters :
	!     b[M]-----vector
	------------------------------------------------------*/
	M := len(A)
	N := len(A[0])
	//the size of mat

	//b = make([]float64, M)

	b = Vec(M)

	var sum float64

	for i := 0; i < M; i++ {
		sum = 0.0
		for j := 0; j < N; j++ {
			sum = sum + A[i][j]*x[j]
		}
		b[i] = sum
	}

	return
}