song.yz@foxmail.com wechat: math-box

统计机器学习

附录:KNN代码



这里给出部分算法的源代码,仅供参考。如对其他代码感兴趣,欢迎与我联系。代码由 Go 语言编写,没有经过详细测试,所以可能有 bug。

另外,在一些公开数据集上的测试中,我发现国外一些文献中同样的算法所报告的识别准确率比我的程序得到的要高。因此,程序可能还有一些问题,请谨慎使用。

package main

import (
	"bufio"
	"fmt"
	"io"
	"math"
	"os"
	"strconv"
	"strings"
)

// main is the program entry point; all it does is hand control to test.
//
// Created   : Song Yezhi, 2022-6-13 (machine learning test)
// Email     : song.yz@foxmail.com
// Copyright : (C) Chinese Academy of Sciences. All rights reserved, 2022.
func main() {
	test()
}

// test evaluates the classifier end to end. It obtains sigma, the class
// catalog, the attribute count and the sample count for "training.txt" from
// dataInfo, classifies every row of "test.txt" with MKNN (K = 5), prints each
// prediction next to the label stored in the row, and finally prints the
// fraction of scored rows whose prediction matched.
//
// Each test row is comma separated: lenAttribute numeric fields followed by
// the expected class label. Blank rows are ignored. Rows that are too short
// or contain an unparsable number are reported on stderr and skipped, so a
// malformed row neither panics nor gets scored as zeros.
//
// Failures to open or read "test.txt" are reported on stderr and end the run.
func test() {
	finName := "training.txt"
	sigma, catalog, lenAttribute, Nlength := dataInfo(finName)

	ftestName := "test.txt"
	ftest, err := os.Open(ftestName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "test: %v\n", err)
		return
	}
	defer ftest.Close()

	br := bufio.NewReader(ftest)

	// Reused for every row; each scored row overwrites all of its entries.
	objAttribute := Vec(lenAttribute)

	total := 0   // rows that were classified
	correct := 0 // rows whose prediction equals the label in the file

	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				// Any other error would otherwise repeat forever.
				fmt.Fprintf(os.Stderr, "test: reading %s: %v\n", ftestName, ioerr)
				break
			}
			// ReadString returns the last line together with io.EOF when the
			// file lacks a trailing newline; score it before leaving the loop.
			done = true
		}

		if strings.TrimSpace(line) == "" {
			continue
		}

		word := strings.Split(line, ",")
		if len(word) <= lenAttribute {
			fmt.Fprintf(os.Stderr, "test: skipping row with %d fields, need %d: %q\n",
				len(word), lenAttribute+1, line)
			continue
		}

		parsed := true
		for i := 0; i < lenAttribute; i++ {
			v, perr := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
			if perr != nil {
				fmt.Fprintf(os.Stderr, "test: skipping row, field %d: %v\n", i+1, perr)
				parsed = false
				break
			}
			objAttribute[i] = v
		}
		if !parsed {
			continue
		}

		// Both labels may still carry the line terminator they were read
		// with, so compare and print the trimmed forms.
		objclass := strings.TrimSpace(MKNN(finName, objAttribute, 5, sigma, catalog, lenAttribute, Nlength))
		realclass := strings.TrimSpace(word[lenAttribute])

		total++
		fmt.Printf("the %d object predicion class is %s , the real object is %s \n", total, objclass, realclass)

		if objclass == realclass {
			correct++
		}
	}

	if total == 0 {
		// Avoid printing NaN for 0/0.
		fmt.Fprintf(os.Stderr, "test: no rows in %s could be scored\n", ftestName)
		return
	}

	fmt.Printf("The prediction accuracy is %f \n", float64(correct)/float64(total))
}

// MKNN classifies one object by a K-nearest-neighbour majority vote over the
// samples stored in the training file finName.
//
// Every training line is comma separated: lenAttribute numeric fields
// followed by the class label. The distance to a sample is the Euclidean
// distance after dividing each attribute difference by sigma[i].
//
// Parameters:
//
//	finName      - path of the training file
//	objAttribute - attribute values of the object to classify
//	K            - number of nearest samples that vote; capped at the number
//	               of usable samples
//	sigma        - per-attribute scale; entries that are 0 or NaN are left
//	               out of the distance
//	catalog      - known class labels; surrounding whitespace (including a
//	               trailing line terminator) is ignored when matching
//	lenAttribute - number of attributes per sample
//	Nlength      - expected number of training samples; used only as a
//	               capacity hint
//
// The result is the catalog entry (returned exactly as passed in) that
// collected the most votes; on a tie the entry listed first wins. An empty
// string is returned when the catalog is empty, the inputs are shorter than
// lenAttribute, the file cannot be opened or read, or it holds no usable
// sample. Problems are reported on stderr because the signature has no error
// result.
//
// Blank lines, lines starting with '#', lines with too few fields and lines
// with an unparsable number are skipped.
//
// Ref: "Data Mining and Analysis". Author: Song Yezhi, 2022-6-13.
func MKNN(finName string, objAttribute []float64, K int, sigma []float64, catalog []string,
	lenAttribute int, Nlength int) (objclass string) {
	const debug = false // set to true to dump distances and the chosen neighbours

	if len(catalog) == 0 {
		return ""
	}
	if lenAttribute < 0 || len(objAttribute) < lenAttribute || len(sigma) < lenAttribute {
		fmt.Fprintf(os.Stderr, "MKNN: need %d attributes, got %d values and %d sigmas\n",
			lenAttribute, len(objAttribute), len(sigma))
		return ""
	}

	fin, err := os.Open(finName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "MKNN: %v\n", err)
		return ""
	}
	defer fin.Close()

	br := bufio.NewReader(fin)

	// The slices grow with the samples actually read, so a wrong Nlength can
	// neither overflow them nor leave phantom zero-distance entries behind.
	sizeHint := Nlength
	if sizeHint < 0 {
		sizeHint = 0
	}
	distance := make([]float64, 0, sizeHint)
	sampleClass := make([]string, 0, sizeHint)
	attrs := make([]float64, lenAttribute)

	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				// Without this a persistent read error would loop forever.
				fmt.Fprintf(os.Stderr, "MKNN: reading %s: %v\n", finName, ioerr)
				return ""
			}
			// The last line arrives together with io.EOF when the file has
			// no trailing newline; it is still a sample.
			done = true
		}

		label, ok := knnParseSample(line, lenAttribute, attrs)
		if !ok {
			continue
		}

		distanceX := 0.0
		for i := 0; i < lenAttribute; i++ {
			s := sigma[i]
			if s == 0 || math.IsNaN(s) {
				// No usable scale for this attribute; dividing by it would
				// turn the whole distance into Inf or NaN.
				continue
			}
			diff := objAttribute[i] - attrs[i]
			distanceX += diff * diff / (s * s)
		}

		distance = append(distance, math.Sqrt(distanceX))
		sampleClass = append(sampleClass, label)
	}

	if len(distance) == 0 {
		return ""
	}

	if debug {
		fmt.Println(distance)
	}

	// Never ask for more neighbours than there are samples.
	neighbours := K
	if neighbours > len(distance) {
		neighbours = len(distance)
	}

	taken := make([]bool, len(distance))
	var candidateID []int
	var candidateCatalog []string
	for n := 0; n < neighbours; n++ {
		// First not-yet-taken sample with the smallest distance.
		best := -1
		for idx, d := range distance {
			if taken[idx] {
				continue
			}
			if best < 0 || d < distance[best] {
				best = idx
			}
		}
		taken[best] = true
		candidateID = append(candidateID, best)
		candidateCatalog = append(candidateCatalog, strings.TrimSpace(sampleClass[best]))
	}

	if debug {
		fmt.Println("candidate ID =")
		fmt.Println(candidateID)

		fmt.Println("candidateCatalog =")
		fmt.Println(candidateCatalog)
	}

	// Count the votes per catalog entry.
	candidateNumber := make([]int, len(catalog))
	for k, name := range catalog {
		name = strings.TrimSpace(name)
		for _, value := range candidateCatalog {
			if value == name {
				candidateNumber[k]++
			}
		}
	}

	// First entry with the highest count wins.
	indMax := 0
	for k := 1; k < len(candidateNumber); k++ {
		if candidateNumber[k] > candidateNumber[indMax] {
			indMax = k
		}
	}

	return catalog[indMax]
}

// knnParseSample parses one training line into attrs (which must hold at
// least lenAttribute values) and returns the raw class field that follows
// the attributes. ok is false for blank lines, lines whose first
// non-blank character is '#', lines with too few fields and lines with an
// unparsable number.
func knnParseSample(line string, lenAttribute int, attrs []float64) (label string, ok bool) {
	trimmed := strings.TrimSpace(line)
	if trimmed == "" || trimmed[0] == '#' {
		return "", false
	}

	word := strings.Split(line, ",")
	if len(word) <= lenAttribute {
		return "", false
	}

	for i := 0; i < lenAttribute; i++ {
		v, err := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
		if err != nil {
			return "", false
		}
		attrs[i] = v
	}

	return word[lenAttribute], true
}

// dataInfo gathers the basic statistics of the training file finName.
//
// Results:
//
//	sigma        - sample standard deviation (divisor Nlength-1) of every
//	               attribute; all zeros when fewer than two samples exist
//	catalog      - class labels as returned by GetCatalog
//	lenAttribute - number of attributes per sample, as returned by GetCatalog
//	Nlength      - number of sample lines counted
//
// A line counts as a sample when it is newline terminated, not blank and does
// not start with '#'. Both passes over the file use that same filter, so the
// mean and sigma are computed over exactly the Nlength counted lines. A final
// line without a trailing newline is not counted.
//
// If the file cannot be opened, read or rewound, the problem is reported on
// stderr and all results are returned as zero values.
//
// Author: Song Yezhi, 2022.06.14.
func dataInfo(finName string) (sigma []float64, catalog []string,
	lenAttribute int, Nlength int) {

	catalog, lenAttribute = GetCatalog(finName)

	fin, err := os.Open(finName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "dataInfo: %v\n", err)
		return nil, nil, 0, 0
	}
	defer fin.Close()

	br := bufio.NewReader(fin)

	// eachSample feeds the comma-separated fields of every sample line to
	// visit and stops at end of file or on the first read error.
	eachSample := func(visit func(word []string)) error {
		for {
			line, ioerr := br.ReadString('\n')
			if ioerr == io.EOF {
				return nil
			}
			if ioerr != nil {
				return ioerr
			}
			if strings.TrimSpace(line) == "" || line[0] == '#' {
				continue
			}
			visit(strings.Split(line, ","))
		}
	}

	// field returns attribute i of a sample. A missing or unparsable field
	// deliberately falls back to ParseFloat's result (0 for syntax errors)
	// so that every counted line still contributes to the statistics.
	field := func(word []string, i int) float64 {
		if i >= len(word) {
			return 0
		}
		v, _ := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
		return v
	}

	// Pass 1: count the samples and accumulate the per-attribute mean.
	mean := Vec(lenAttribute)
	if err := eachSample(func(word []string) {
		Nlength++
		for i := range mean {
			mean[i] += field(word, i)
		}
	}); err != nil {
		fmt.Fprintf(os.Stderr, "dataInfo: reading %s: %v\n", finName, err)
		return nil, nil, 0, 0
	}

	if Nlength > 0 {
		for i := range mean {
			mean[i] /= float64(Nlength)
		}
	}

	// Rewind for the second pass and drop whatever the reader still buffers.
	if _, err := fin.Seek(0, io.SeekStart); err != nil {
		fmt.Fprintf(os.Stderr, "dataInfo: rewinding %s: %v\n", finName, err)
		return nil, nil, 0, 0
	}
	br.Reset(fin)

	// Pass 2: accumulate the squared deviations from the mean.
	sigma = Vec(lenAttribute)
	if err := eachSample(func(word []string) {
		for i := range sigma {
			d := field(word, i) - mean[i]
			sigma[i] += d * d
		}
	}); err != nil {
		fmt.Fprintf(os.Stderr, "dataInfo: reading %s: %v\n", finName, err)
		return nil, nil, 0, 0
	}

	for i := range sigma {
		if Nlength > 1 {
			sigma[i] = math.Sqrt(sigma[i] / float64(Nlength-1))
		} else {
			// The sample deviation is undefined for fewer than two samples;
			// report 0 rather than the NaN that 0/0 would give.
			sigma[i] = 0
		}
	}

	return sigma, catalog, lenAttribute, Nlength
}

// getMin returns the index of the smallest element of v.
//
// When several elements share the smallest value the first one wins. NaN
// entries are ignored because they compare false against everything; if every
// entry is NaN the result is 0. An empty slice yields -1.
//
// Author: Song Yezhi, 2022-6-13.
func getMin(v []float64) (iMin int) {
	iMin = -1
	for i, x := range v {
		if math.IsNaN(x) {
			continue
		}
		if iMin < 0 || x < v[iMin] {
			iMin = i
		}
	}

	if iMin < 0 && len(v) > 0 {
		// Only NaN values were seen; fall back to the first position.
		return 0
	}
	return iMin
}

// getMaxInt returns the index of the largest element of the int slice v.
//
// When several elements share the largest value the first one wins. An empty
// slice yields -1.
//
// Author: Song Yezhi, 2022-6-13.
func getMaxInt(v []int) (imax int) {
	if len(v) == 0 {
		return -1
	}

	imax = 0
	for i := 1; i < len(v); i++ {
		if v[i] > v[imax] {
			imax = i
		}
	}
	return imax
}

// GetCatalog scans the training file finName and collects its class labels.
//
// Every data line is comma separated and ends with the class label. Blank
// lines and lines starting with '#' are skipped.
//
// Results:
//
//	catalog      - the distinct labels in order of first appearance. Each
//	               label is the raw text after the last comma, so it keeps
//	               leading blanks and the line terminator it was read with.
//	lenAttribute - number of fields before the label, taken from the last
//	               data line read
//
// A final line without a trailing newline is included; its label is treated
// as already known when it equals an earlier label apart from that earlier
// label's line terminator.
//
// If the file cannot be opened or read, the problem is reported on stderr and
// (nil, 0) is returned.
//
// Author: Song Yezhi, 2022-6-11.
func GetCatalog(finName string) (catalog []string, lenAttribute int) {
	fin, err := os.Open(finName)
	if err != nil {
		fmt.Fprintf(os.Stderr, "GetCatalog: %v\n", err)
		return nil, 0
	}
	defer fin.Close()

	br := bufio.NewReader(fin)

	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				// Without this a persistent read error would loop forever.
				fmt.Fprintf(os.Stderr, "GetCatalog: reading %s: %v\n", finName, ioerr)
				return nil, 0
			}
			// The last line arrives together with io.EOF when the file has
			// no trailing newline.
			done = true
		}

		if strings.TrimSpace(line) == "" || line[0] == '#' {
			continue
		}

		word := strings.Split(line, ",")

		// Everything before the label is an attribute.
		lenAttribute = len(word) - 1
		label := word[len(word)-1]

		known := false
		for _, existing := range catalog {
			// Only the unterminated final line needs the terminator of the
			// stored label stripped before comparing.
			if existing == label || (done && strings.TrimRight(existing, "\r\n") == label) {
				known = true
				break
			}
		}

		if !known {
			catalog = append(catalog, label)
		}
	}

	return catalog, lenAttribute
}

// Vec returns a new float64 slice of length N whose elements are all zero.
//
// Author: Song Yezhi, 2020-5-24.
func Vec(N int) (V []float64) {
	// make hands back zeroed storage, so no explicit fill is required.
	return make([]float64, N)
}

// Mat creates a two-dimensional slice (matrix) with M rows of N columns each;
// every element is zero and every row has its own backing storage.
//
// Author: Song Yezhi, 2020-5-23.
func Mat(M int, N int) (A [][]float64) {
	A = make([][]float64, M)
	for row := range A {
		// Freshly made rows are already zero-filled.
		A[row] = make([]float64, N)
	}
	return A
}

// MatOutput prints the matrix A to standard output, one row per line. Every
// element is written with the format "%18.6f" followed by three spaces.
//
// Each row is printed with its own length, so an empty matrix prints nothing
// and rows of different lengths are handled without indexing past their end.
//
// Author: Song Yezhi, 2020-5-24.
func MatOutput(A [][]float64) {
	for _, row := range A {
		for _, x := range row {
			fmt.Printf("%18.6f   ", x)
		}
		fmt.Printf("\n")
	}
}

// VecOutput prints the elements of V to standard output, one per line, each
// formatted with "%18.6f".
func VecOutput(V []float64) {
	for _, x := range V {
		fmt.Printf("%18.6f\n", x)
	}
}

// MatDim returns the dimensions of the matrix A: M is the number of rows and
// N is the length of the last row. An empty matrix yields (0, 0).
//
// Author: Song Yezhi, 2020-5-24.
func MatDim(A [][]float64) (M int, N int) {
	M = len(A)
	if M == 0 {
		// There is no last row to measure.
		return 0, 0
	}
	N = len(A[M-1])
	return M, N
}