统计机器学习
附录:KNN代码
这里给出部分算法的源代码,仅供参考;如对其他代码感兴趣,欢迎与我联系。代码由 Go 语言编写,没有经过详细测试,所以可能有 bug。
另外,在一些公开数据集的测试中,我发现国外一些文献中同样的算法得到的识别准确率比我的程序要高。因此,程序可能还有一些问题,请谨慎使用。
package main
import (
"bufio"
"fmt"
"io"
"math"
"os"
"strconv"
"strings"
)
// main is the program entry point. It only starts the machine-learning
// test driver, test.
//
// Created   : Song Yezhi 2022-6-13 2:40
// Email     : song.yz@foxmail.com
// Copyright : (C) Chinese Academy of Sciences. All rights reserved, 2022.
func main() {
	test()
}
// test evaluates the classifier: it gathers the training-set statistics of
// "training.txt" with dataInfo, classifies every record of "test.txt" with
// MKNN (K = 5), prints each prediction next to the label found in the record,
// and finally prints the fraction of matching predictions.
//
// Every test record must hold lenAttribute comma-separated numeric attributes
// followed by the class label. Records with too few fields or with an
// attribute that is not a number are reported on stderr and skipped instead
// of crashing the run or being classified from garbage values.
func test() {
	finName := "training.txt"
	sigma, catalog, lenAttribute, Nlength := dataInfo(finName)

	ftestName := "test.txt"
	ftest, err := os.Open(ftestName)
	if err != nil {
		// Reading from a file that failed to open never yields io.EOF, so
		// the loop below would never terminate; stop here instead.
		fmt.Fprintf(os.Stderr, "test: %v\n", err)
		return
	}
	defer ftest.Close()
	br := bufio.NewReader(ftest)

	objAttribute := Vec(lenAttribute)
	total := 0   // number of classified records
	correct := 0 // number of predictions equal to the recorded label
	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				fmt.Fprintf(os.Stderr, "test: reading %s: %v\n", ftestName, ioerr)
				break
			}
			// ReadString returns the remaining data together with io.EOF
			// when the last line has no trailing newline, so that line is
			// still processed before the loop ends.
			done = true
		}
		record := strings.TrimSpace(line)
		if record == "" {
			continue
		}
		word := strings.Split(record, ",")
		if len(word) <= lenAttribute {
			fmt.Fprintf(os.Stderr, "test: skipping record with %d fields, need %d: %q\n",
				len(word), lenAttribute+1, record)
			continue
		}
		parsed := true
		for i := 0; i < lenAttribute; i++ {
			value, perr := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
			if perr != nil {
				fmt.Fprintf(os.Stderr, "test: skipping record %q: %v\n", record, perr)
				parsed = false
				break
			}
			objAttribute[i] = value
		}
		if !parsed {
			continue
		}
		// Labels are trimmed before printing and comparing so that line
		// terminators carried along from the data files neither leak into
		// the output nor affect the comparison.
		objclass := strings.TrimSpace(MKNN(finName, objAttribute, 5, sigma, catalog, lenAttribute, Nlength))
		realClass := strings.TrimSpace(word[lenAttribute])
		total++
		fmt.Printf("the %d object predicion class is %s , the real object is %s \n", total, objclass, realClass)
		if objclass == realClass {
			correct++
		}
	}
	if total == 0 {
		// Avoid printing NaN (0/0) when nothing could be classified.
		fmt.Fprintf(os.Stderr, "test: no usable records in %s\n", ftestName)
		return
	}
	fmt.Printf("The prediction accuracy is %f \n", float64(correct)/float64(total))
}
// MKNN classifies one object with a K-nearest-neighbour majority vote over
// the training samples stored in the file finName.
//
// Ref: "data mining and analysis".
//
// The distance to each training sample is the Euclidean distance with every
// attribute difference divided by the matching sigma entry. A sigma entry
// that is missing, zero, negative or NaN is replaced by 1 so that the
// distance stays finite.
//
// Input parameters:
//
//	finName      - training data file: one sample per line, lenAttribute
//	               comma-separated numeric attributes followed by the class
//	               label. Blank lines, lines starting with '#' and lines with
//	               too few fields are skipped. A field that is not a number
//	               counts as 0.
//	objAttribute - attributes of the object to classify (at least
//	               lenAttribute values)
//	K            - number of nearest samples that vote; values larger than
//	               the number of samples are reduced to that number
//	sigma        - per-attribute scale
//	catalog      - known class labels; surrounding whitespace is ignored when
//	               they are compared with the labels read from the file
//	lenAttribute - number of attributes per sample
//	Nlength      - expected number of training samples; only used as a
//	               capacity hint
//
// Output parameters:
//
//	objclass - the catalog entry (exactly as passed in) with the most votes;
//	           on a tie the entry that comes first in catalog wins. It is ""
//	           when catalog is empty, objAttribute is too short, the file
//	           cannot be read or the file holds no usable sample. Failures
//	           are reported on stderr.
//
// Created   : Song Yezhi 2022-6-13 0:44
// Email     : song.yz@foxmail.com
// Copyright : (C) Chinese Academy of Sciences. All rights reserved, 2022.
func MKNN(finName string, objAttribute []float64, K int, sigma []float64, catalog []string,
	lenAttribute int, Nlength int) (objclass string) {
	if len(catalog) == 0 {
		return ""
	}
	if lenAttribute < 0 || len(objAttribute) < lenAttribute {
		fmt.Fprintf(os.Stderr, "MKNN: object has %d attributes, need %d\n", len(objAttribute), lenAttribute)
		return ""
	}

	distance, sampleClass, err := mknnDistances(finName, objAttribute, sigma, lenAttribute, Nlength)
	if err != nil {
		fmt.Fprintf(os.Stderr, "MKNN: %v\n", err)
		return ""
	}
	if len(distance) == 0 {
		return ""
	}
	if K > len(distance) {
		// There cannot be more voters than samples.
		K = len(distance)
	}

	// Trim the catalog once so the vote is insensitive to line terminators
	// or padding carried along with the labels.
	labels := make([]string, len(catalog))
	for c, name := range catalog {
		labels[c] = strings.TrimSpace(name)
	}

	votes := make([]int, len(catalog))
	taken := make([]bool, len(distance))
	for n := 0; n < K; n++ {
		// Pick the closest sample that has not voted yet; on equal
		// distances the sample that appears first in the file is chosen.
		best := -1
		for idx, d := range distance {
			if taken[idx] {
				continue
			}
			if best < 0 || d < distance[best] {
				best = idx
			}
		}
		taken[best] = true
		label := strings.TrimSpace(sampleClass[best])
		for c := range labels {
			if labels[c] == label {
				votes[c]++
			}
		}
	}

	winner := 0
	for c := 1; c < len(votes); c++ {
		if votes[c] > votes[winner] {
			winner = c
		}
	}
	return catalog[winner]
}

// mknnDistances reads the training file and returns, for every usable sample,
// its scaled distance to objAttribute and its raw class label.
func mknnDistances(finName string, objAttribute []float64, sigma []float64,
	lenAttribute int, sizeHint int) ([]float64, []string, error) {
	fin, err := os.Open(finName)
	if err != nil {
		return nil, nil, err
	}
	defer fin.Close()

	if sizeHint < 0 {
		sizeHint = 0
	}
	distance := make([]float64, 0, sizeHint)
	sampleClass := make([]string, 0, sizeHint)

	br := bufio.NewReader(fin)
	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				return nil, nil, fmt.Errorf("reading %s: %w", finName, ioerr)
			}
			// A last line without a trailing newline arrives together
			// with io.EOF and still has to be processed.
			done = true
		}
		if strings.TrimSpace(line) == "" || line[0] == '#' {
			continue
		}
		word := strings.Split(line, ",")
		if len(word) <= lenAttribute {
			// Not enough fields for all attributes plus the label.
			continue
		}
		sum := 0.0
		for i := 0; i < lenAttribute; i++ {
			// A field that is not a number deliberately counts as 0.
			value, _ := strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
			scale := 1.0
			if i < len(sigma) && sigma[i] > 0 {
				scale = sigma[i]
			}
			sum = sum + (objAttribute[i]-value)*(objAttribute[i]-value)/(scale*scale)
		}
		distance = append(distance, math.Sqrt(sum))
		sampleClass = append(sampleClass, word[lenAttribute])
	}
	return distance, sampleClass, nil
}
// dataInfo gathers basic information about the training data in finName.
//
// Input parameters:
//
//	finName - training data file: one sample per line, comma-separated
//	          numeric attributes followed by the class label
//
// Output parameters:
//
//	sigma        - sample standard deviation (divisor Nlength-1) of each
//	               attribute; all zeros when fewer than two samples were read
//	               or the file could not be read
//	catalog      - class labels as returned by GetCatalog
//	lenAttribute - number of attributes per sample as returned by GetCatalog
//	Nlength      - number of samples counted
//
// Blank lines, lines whose first character is '#' and lines with too few
// fields are not counted. A field that is not a number counts as 0.
// Failures to open or read the file are reported on stderr.
//
// NOTE(review): a final line that is not terminated by a newline is not
// counted, because the read loop stops as soon as io.EOF is returned. Nlength
// is handed to callers that may size per-sample buffers from it, so this has
// to be changed together with every reader of the training file.
//
// Created   : Song Yezhi 2022.06.14
// Email     : song.yz@foxmail.com
// Copyright : (C) Chinese Academy of Sciences. All rights reserved, 2022.
func dataInfo(finName string) (sigma []float64, catalog []string,
	lenAttribute int, Nlength int) {
	catalog, lenAttribute = GetCatalog(finName)
	sigma = make([]float64, lenAttribute)

	fin, err := os.Open(finName)
	if err != nil {
		// Reading from a file that failed to open never yields io.EOF, so
		// the loop below would never terminate; stop here instead.
		fmt.Fprintf(os.Stderr, "dataInfo: %v\n", err)
		return sigma, catalog, lenAttribute, 0
	}
	defer fin.Close()

	// The file is read once and the parsed samples are kept in memory, so
	// mean and sigma are computed from exactly the same rows and values.
	var rows [][]float64
	br := bufio.NewReader(fin)
	for {
		line, ioerr := br.ReadString('\n')
		if ioerr == io.EOF {
			break
		}
		if ioerr != nil {
			fmt.Fprintf(os.Stderr, "dataInfo: reading %s: %v\n", finName, ioerr)
			return sigma, catalog, lenAttribute, 0
		}
		if strings.TrimSpace(line) == "" || line[0] == '#' {
			continue
		}
		word := strings.Split(line, ",")
		if len(word) <= lenAttribute {
			// Not enough fields for all attributes plus the label.
			continue
		}
		row := make([]float64, lenAttribute)
		for i := range row {
			// A field that is not a number deliberately counts as 0.
			// Whitespace is trimmed so that " 1.5" parses as 1.5.
			row[i], _ = strconv.ParseFloat(strings.TrimSpace(word[i]), 64)
		}
		rows = append(rows, row)
	}

	Nlength = len(rows)
	if Nlength < 2 {
		// The sample standard deviation needs at least two samples;
		// dividing by Nlength-1 would otherwise produce NaN.
		return sigma, catalog, lenAttribute, Nlength
	}

	mean := make([]float64, lenAttribute)
	for _, row := range rows {
		for i, value := range row {
			mean[i] = mean[i] + value
		}
	}
	for i := range mean {
		mean[i] = mean[i] / float64(Nlength)
	}

	for _, row := range rows {
		for i, value := range row {
			sigma[i] = sigma[i] + (value-mean[i])*(value-mean[i])
		}
	}
	for i := range sigma {
		sigma[i] = math.Sqrt(sigma[i] * 1.0 / float64(Nlength-1))
	}
	return sigma, catalog, lenAttribute, Nlength
}
// getMin returns the index of the smallest element of v.
//
// Input parameters:
//
//	v - float slice
//
// Output parameters:
//
//	iMin - index of the smallest element; the first one when several
//	       elements are equal. NaN elements are passed over as long as the
//	       slice holds at least one value that is not NaN. If every element
//	       is NaN the result is 0, and for an empty slice it is -1.
//
// Created : Song Yezhi 2022-6-13 1:31
func getMin(v []float64) (iMin int) {
	if len(v) == 0 {
		return -1
	}
	iMin = 0
	for i := 1; i < len(v); i++ {
		// Any comparison with NaN is false, so a NaN candidate has to be
		// replaced explicitly by the first real number that follows it.
		if v[i] < v[iMin] || (math.IsNaN(v[iMin]) && !math.IsNaN(v[i])) {
			iMin = i
		}
	}
	return iMin
}
// getMaxInt returns the index of the largest element of v.
//
// Input parameters:
//
//	v - int slice
//
// Output parameters:
//
//	imax - index of the largest element; the first one when several
//	       elements are equal, and -1 for an empty slice
//
// Created : Song Yezhi 2022-6-13 1:31
func getMaxInt(v []int) (imax int) {
	if len(v) == 0 {
		return -1
	}
	imax = 0
	for i := 1; i < len(v); i++ {
		if v[i] > v[imax] {
			imax = i
		}
	}
	return imax
}
// GetCatalog collects the distinct class labels of the training data.
//
// Input parameters:
//
//	finName - training data file: one sample per line, comma-separated
//	          attributes followed by the class label as the last field
//
// Output parameters:
//
//	catalog      - distinct class labels in order of first appearance. Each
//	               label is stored exactly as it was split from its line,
//	               including any trailing line terminator, so it stays
//	               comparable with labels obtained by splitting raw lines of
//	               the same file. Only the line terminator is ignored when
//	               checking whether a label is already known.
//	lenAttribute - number of fields before the label (the response
//	               variable), taken from the last data line read
//
// Blank lines and lines whose first character is '#' are skipped. A last
// line without a trailing newline is processed like any other line. If the
// file cannot be opened or read, the problem is reported on stderr and
// (nil, 0) is returned.
//
// Created   : Song Yezhi 2022-6-11 23:43
// Email     : song.yz@foxmail.com
// Copyright : (C) Chinese Academy of Sciences. All rights reserved, 2022.
func GetCatalog(finName string) (catalog []string, lenAttribute int) {
	fin, err := os.Open(finName)
	if err != nil {
		// Reading from a file that failed to open never yields io.EOF, so
		// the loop below would never terminate; stop here instead.
		fmt.Fprintf(os.Stderr, "GetCatalog: %v\n", err)
		return nil, 0
	}
	defer fin.Close()

	br := bufio.NewReader(fin)
	for done := false; !done; {
		line, ioerr := br.ReadString('\n')
		if ioerr != nil {
			if ioerr != io.EOF {
				fmt.Fprintf(os.Stderr, "GetCatalog: reading %s: %v\n", finName, ioerr)
				return nil, 0
			}
			// A last line without a trailing newline arrives together
			// with io.EOF and still has to be processed.
			done = true
		}
		if strings.TrimSpace(line) == "" || line[0] == '#' {
			continue
		}
		word := strings.Split(line, ",")
		lenAttribute = len(word) - 1
		label := word[len(word)-1]

		key := strings.TrimRight(label, "\r\n")
		known := false
		for _, existing := range catalog {
			if strings.TrimRight(existing, "\r\n") == key {
				known = true
				break
			}
		}
		if !known {
			catalog = append(catalog, label)
		}
	}
	return catalog, lenAttribute
}
// Vec creates a float64 slice of length N with every element set to zero.
//
// Author  : Song Yezhi
// Version : 2020-5-24 17:50
func Vec(N int) (V []float64) {
	// make already hands back zeroed storage, so no explicit fill is needed.
	return make([]float64, N)
}
// Mat creates a two-dimensional slice (matrix) with M rows and N columns.
// Each row is allocated separately and every element is zero.
//
// Author  : Song Yezhi
// Version : 2020-5-23 23:43
func Mat(M int, N int) (A [][]float64) {
	A = make([][]float64, M)
	for row := range A {
		// make already hands back zeroed storage for the row.
		A[row] = make([]float64, N)
	}
	return A
}
// MatOutput prints the matrix A to standard output, one row per line. Every
// element is written with the format "%18.6f " and the number of columns
// printed for each row is the one reported by MatDim.
//
// Author  : Song Yezhi
// Version : 2020-5-24 16:54
func MatOutput(A [][]float64) {
	_, cols := MatDim(A)
	for _, row := range A {
		for c := 0; c < cols; c++ {
			fmt.Printf("%18.6f ", row[c])
		}
		fmt.Println()
	}
}
// VecOutput prints the elements of V to standard output, one per line, each
// written with the format "%18.6f".
func VecOutput(V []float64) {
	for _, value := range V {
		fmt.Printf("%18.6f\n", value)
	}
}
// MatDim returns the dimensions of the matrix A.
//
// Output parameters:
//
//	M - number of rows, len(A)
//	N - number of columns, taken from the length of the last row; 0 when A
//	    has no rows
//
// Author  : Song Yezhi
// Version : 2020-5-24 12:45
func MatDim(A [][]float64) (M int, N int) {
	M = len(A)
	if M == 0 {
		// A[M-1] would index out of range on an empty matrix.
		return 0, 0
	}
	return M, len(A[M-1])
}