Go语言自然语言处理:文本处理与分析
Go语言自然语言处理:文本处理与分析
引言
自然语言处理(NLP)是人工智能的重要分支,它使计算机能够理解、处理和生成人类语言。Go语言以其高性能和并发能力,成为构建NLP应用的理想选择。本文将介绍如何使用Go语言进行自然语言处理。
一、文本处理基础
1.1 字符串操作
package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// main demonstrates basic string operations on a mixed English/Chinese
// sample: length, case conversion, whitespace splitting and replacement.
func main() {
	text := "Hello, World! 你好,世界!"

	// len reports the number of UTF-8 bytes, which exceeds the number of
	// characters as soon as the text contains non-ASCII runes.
	fmt.Printf("长度: %d\n", len(text))
	// The rune count is the character length a reader would expect.
	fmt.Printf("字符数: %d\n", utf8.RuneCountInString(text))

	// Convert to lower case.
	fmt.Printf("小写: %s\n", strings.ToLower(text))

	// Convert to upper case.
	fmt.Printf("大写: %s\n", strings.ToUpper(text))

	// Split on runs of whitespace.
	words := strings.Fields(text)
	fmt.Printf("分词: %v\n", words)

	// Replace every occurrence of "World" with "Go".
	replaced := strings.ReplaceAll(text, "World", "Go")
	fmt.Printf("替换后: %s\n", replaced)
}

// Next section: 1.2 Unicode处理
package main

import (
	"fmt"
	"unicode"
)

// runeCategory returns the label printed for r. The checks run in the order
// letter, digit, whitespace; anything else is reported as "other".
func runeCategory(r rune) string {
	if unicode.IsLetter(r) {
		return "字母"
	}
	if unicode.IsDigit(r) {
		return "数字"
	}
	if unicode.IsSpace(r) {
		return "空格"
	}
	return "其他"
}

// main walks the sample text rune by rune (not byte by byte) and prints each
// rune followed by its category.
func main() {
	const sample = "Hello 世界 123 !"

	for _, ch := range sample {
		fmt.Printf("%c - 类型: ", ch)
		fmt.Println(runeCategory(ch))
	}
}

// Next section: 二、分词处理
2.1 英文分词
package main

import (
	"fmt"
	"regexp"
	"strings"
)

// nonWordPattern matches every character that is not a Unicode letter, mark,
// digit, underscore or whitespace/separator. It is compiled once at package
// scope instead of on every tokenize call.
var nonWordPattern = regexp.MustCompile(`[^\p{L}\p{M}\p{N}_\s\p{Z}]`)

// tokenize removes punctuation and symbols from text, lower-cases it and
// splits it on whitespace.
//
// Punctuation is deleted rather than treated as a separator, so "don't"
// becomes "dont". Letters and digits of any script are kept; Go's \w is
// ASCII-only and would otherwise drop characters such as "é" or CJK text.
func tokenize(text string) []string {
	cleaned := nonWordPattern.ReplaceAllString(text, "")
	return strings.Fields(strings.ToLower(cleaned))
}

// main tokenizes a sample sentence and prints the tokens.
func main() {
	text := "Hello, World! This is a test sentence."
	tokens := tokenize(text)
	fmt.Printf("分词结果: %v\n", tokens)
}

// Next section: 2.2 中文分词
go get github.com/go-ego/gsepackage main import ( "fmt" "github.com/go-ego/gse" ) func main() { seg := gse.New() // 加载词典 err := seg.LoadDict("zh") if err != nil { panic(err) } text := "我爱北京天安门" words := seg.Cut(text, true) fmt.Printf("中文分词结果: %v\n", words) }三、词频统计
3.1 基础词频统计
package main

import (
	"fmt"
	"sort"
	"strings"
	"unicode"
)

// wordFrequency counts how often each word occurs in text.
//
// The text is lower-cased and split on whitespace. Leading and trailing
// punctuation is trimmed from every token so that "Go!" and "go" are counted
// as the same word; tokens that consist only of punctuation are skipped.
// Punctuation inside a word (as in "don't") is preserved.
func wordFrequency(text string) map[string]int {
	freq := make(map[string]int)
	for _, field := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(field, unicode.IsPunct)
		if word == "" {
			continue
		}
		freq[word]++
	}
	return freq
}

// sortByFrequency returns the words of freq ordered by descending count.
// Words with equal counts are ordered alphabetically so the result does not
// depend on map iteration order.
func sortByFrequency(freq map[string]int) []string {
	words := make([]string, 0, len(freq))
	for word := range freq {
		words = append(words, word)
	}

	sort.Slice(words, func(i, j int) bool {
		ci, cj := freq[words[i]], freq[words[j]]
		if ci != cj {
			return ci > cj
		}
		return words[i] < words[j]
	})
	return words
}

// main prints the word frequencies of a sample text, most frequent first.
func main() {
	text := "Hello world! Hello Go! Go is great. Go is fun."

	freq := wordFrequency(text)
	sortedWords := sortByFrequency(freq)

	fmt.Println("词频统计:")
	for _, word := range sortedWords {
		fmt.Printf("%s: %d\n", word, freq[word])
	}
}

// Next section: 3.2 TF-IDF计算
package main

import (
	"fmt"
	"math"
)

// computeTF returns the term frequency of term in doc: the number of
// occurrences divided by the document length. An empty document yields 0
// instead of the NaN that 0/0 would produce.
func computeTF(term string, doc []string) float64 {
	if len(doc) == 0 {
		return 0
	}
	count := 0
	for _, word := range doc {
		if word == term {
			count++
		}
	}
	return float64(count) / float64(len(doc))
}

// containsTerm reports whether doc contains term at least once.
func containsTerm(doc []string, term string) bool {
	for _, word := range doc {
		if word == term {
			return true
		}
	}
	return false
}

// computeIDF returns the smoothed inverse document frequency of term:
//
//	ln((1 + N) / (1 + df)) + 1
//
// where N is the number of documents and df the number of documents that
// contain term. The smoothing keeps the value finite for unseen terms and
// strictly positive for terms that occur in every document, so a common term
// never gets a zero or negative weight.
func computeIDF(term string, docs [][]string) float64 {
	docCount := 0
	for _, doc := range docs {
		if containsTerm(doc, term) {
			docCount++
		}
	}
	return math.Log(float64(1+len(docs))/float64(1+docCount)) + 1
}

// computeTFIDF returns the TF-IDF weight of term in doc relative to the
// corpus docs.
func computeTFIDF(term string, doc []string, docs [][]string) float64 {
	return computeTF(term, doc) * computeIDF(term, docs)
}

// main computes the TF-IDF weight of "go" in the first document of a small
// corpus.
func main() {
	docs := [][]string{
		{"hello", "world", "go"},
		{"hello", "go", "lang"},
		{"world", "programming"},
	}

	term := "go"
	doc := docs[0]

	tfidf := computeTFIDF(term, doc, docs)
	fmt.Printf("TF-IDF for '%s': %.4f\n", term, tfidf)
}

// Next section: 四、文本分类
4.1 朴素贝叶斯分类器
package main

import (
	"fmt"
	"math"
	"strings"
)

// NaiveBayesClassifier is a multinomial naive Bayes text classifier with
// add-one (Laplace) smoothing. Create instances with NewNaiveBayesClassifier.
type NaiveBayesClassifier struct {
	classCounts     map[string]int            // class -> number of training documents
	wordCounts      map[string]map[string]int // class -> word -> occurrences
	classWordTotals map[string]int            // class -> total word occurrences
	vocabulary      map[string]struct{}       // every distinct word seen in training
	totalDocuments  int
}

// NewNaiveBayesClassifier returns an empty classifier ready for training.
func NewNaiveBayesClassifier() *NaiveBayesClassifier {
	return &NaiveBayesClassifier{
		classCounts:     make(map[string]int),
		wordCounts:      make(map[string]map[string]int),
		classWordTotals: make(map[string]int),
		vocabulary:      make(map[string]struct{}),
	}
}

// splitWords lower-cases doc and splits it on whitespace. Train and Predict
// share it so both see identical tokens.
func splitWords(doc string) []string {
	return strings.Fields(strings.ToLower(doc))
}

// Train adds the labelled documents to the model. docs[i] is labelled with
// labels[i]; if the slices differ in length only the pairs present in both
// are used, so a short labels slice cannot cause an out-of-range panic.
func (nb *NaiveBayesClassifier) Train(docs []string, labels []string) {
	n := len(docs)
	if len(labels) < n {
		n = len(labels)
	}

	for i := 0; i < n; i++ {
		label := labels[i]
		nb.classCounts[label]++
		nb.totalDocuments++

		counts, ok := nb.wordCounts[label]
		if !ok {
			counts = make(map[string]int)
			nb.wordCounts[label] = counts
		}

		for _, word := range splitWords(docs[i]) {
			counts[word]++
			nb.classWordTotals[label]++
			nb.vocabulary[word] = struct{}{}
		}
	}
}

// Predict returns the most probable class for doc, or "" if the classifier
// has not been trained. Scores are summed in log space:
//
//	log P(class) + sum over words of log((count(word, class) + 1) / (words(class) + |V|))
//
// where words(class) is the total number of word occurrences in the class and
// |V| the vocabulary size. Equal scores are resolved in favour of the
// alphabetically smaller class name so the result is deterministic.
func (nb *NaiveBayesClassifier) Predict(doc string) string {
	words := splitWords(doc)
	vocabSize := len(nb.vocabulary)

	bestClass := ""
	bestScore := math.Inf(-1)

	for class, docCount := range nb.classCounts {
		score := math.Log(float64(docCount) / float64(nb.totalDocuments))

		// A zero denominator means no word was ever seen in training; the
		// prior is then the only available evidence.
		if denom := float64(nb.classWordTotals[class] + vocabSize); denom > 0 {
			counts := nb.wordCounts[class]
			for _, word := range words {
				score += math.Log(float64(counts[word]+1) / denom)
			}
		}

		if score > bestScore || (score == bestScore && class < bestClass) {
			bestScore = score
			bestClass = class
		}
	}

	return bestClass
}

// main trains the classifier on six short reviews and classifies a new one.
func main() {
	nb := NewNaiveBayesClassifier()

	docs := []string{
		"I love this movie",
		"Great film, highly recommend",
		"Terrible movie, waste of time",
		"Hated every minute of it",
		"Excellent performance",
		"Poor acting, bad script",
	}
	labels := []string{"positive", "positive", "negative", "negative", "positive", "negative"}

	nb.Train(docs, labels)

	testDoc := "This movie was amazing"
	prediction := nb.Predict(testDoc)
	fmt.Printf("预测结果: %s\n", prediction)
}

// Next section: 五、文本生成
5.1 马尔可夫链文本生成
package main

import (
	"fmt"
	"math/rand"
	"strings"
	"time"
)

// MarkovChain is a word-level Markov chain text generator. A state is a
// sequence of `order` consecutive words joined by single spaces. Create
// instances with NewMarkovChain.
type MarkovChain struct {
	transitions map[string][]string // state -> words observed after it (duplicates kept as weights)
	order       int
	rng         *rand.Rand // per-chain source; avoids reseeding the global generator
}

// NewMarkovChain returns an empty chain whose states span order words.
// Orders below 1 are raised to 1, because a state needs at least one word.
func NewMarkovChain(order int) *MarkovChain {
	if order < 1 {
		order = 1
	}
	return &MarkovChain{
		transitions: make(map[string][]string),
		order:       order,
		rng:         rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// Train lower-cases text, splits it on whitespace and records, for every run
// of `order` words, the word that follows it. Text with too few words to form
// a state plus a successor adds nothing.
func (mc *MarkovChain) Train(text string) {
	words := strings.Fields(strings.ToLower(text))

	for i := 0; i+mc.order < len(words); i++ {
		key := strings.Join(words[i:i+mc.order], " ")
		mc.transitions[key] = append(mc.transitions[key], words[i+mc.order])
	}
}

// Generate returns at most length words of generated text. It starts from a
// randomly chosen trained state and stops early when the current state has no
// recorded successor. It returns "" when length is not positive or the chain
// is untrained.
func (mc *MarkovChain) Generate(length int) string {
	if length <= 0 || len(mc.transitions) == 0 {
		return ""
	}

	// Pick a random starting state.
	keys := make([]string, 0, len(mc.transitions))
	for key := range mc.transitions {
		keys = append(keys, key)
	}
	current := keys[mc.rng.Intn(len(keys))]
	result := strings.Split(current, " ")

	// The starting state alone may already exceed the requested length.
	if length < len(result) {
		return strings.Join(result[:length], " ")
	}

	for len(result) < length {
		nextWords := mc.transitions[current]
		if len(nextWords) == 0 {
			break
		}
		result = append(result, nextWords[mc.rng.Intn(len(nextWords))])

		// The new state is the last `order` words produced so far.
		current = strings.Join(result[len(result)-mc.order:], " ")
	}

	return strings.Join(result, " ")
}

// main trains a second-order chain on a short text and prints ten generated
// words.
func main() {
	text := `I love Go programming. Go is a great language. Go is fast and efficient. I love programming in Go.`

	mc := NewMarkovChain(2)
	mc.Train(text)

	generated := mc.Generate(10)
	fmt.Printf("生成文本: %s\n", generated)
}

// Next section: 六、情感分析
6.1 简单情感分析
package main

import (
	"fmt"
	"strings"
	"unicode"
)

// Sentiment lexicons; keys are lower-case words.
var (
	positiveWords = map[string]bool{
		"love":      true,
		"great":     true,
		"excellent": true,
		"amazing":   true,
		"good":      true,
		"best":      true,
		"wonderful": true,
		"fantastic": true,
	}

	negativeWords = map[string]bool{
		"hate":     true,
		"terrible": true,
		"bad":      true,
		"awful":    true,
		"worst":    true,
		"poor":     true,
		"horrible": true,
	}
)

// analyzeSentiment returns a lexicon-based polarity score in [-1, 1]:
// (positive - negative) / (positive + negative), or 0 when the text contains
// no lexicon word.
//
// The text is lower-cased and split on whitespace, and punctuation is trimmed
// from both ends of every token so that "amazing!" and "bad." still match
// their lexicon entries. Negation ("not great") is not modelled.
func analyzeSentiment(text string) float64 {
	positiveCount := 0
	negativeCount := 0

	for _, field := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(field, unicode.IsPunct)
		if positiveWords[word] {
			positiveCount++
		}
		if negativeWords[word] {
			negativeCount++
		}
	}

	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}

// main scores three sample sentences and labels each one positive, negative
// or neutral using a threshold of 0.3 on either side of zero.
func main() {
	texts := []string{
		"I love this movie, it's amazing!",
		"Terrible experience, hated it.",
		"It was okay, not great but not bad.",
	}

	for _, text := range texts {
		score := analyzeSentiment(text)

		sentiment := "中性"
		if score > 0.3 {
			sentiment = "正面"
		} else if score < -0.3 {
			sentiment = "负面"
		}

		fmt.Printf("文本: %s\n情感得分: %.2f (%s)\n\n", text, score, sentiment)
	}
}

// Next section: 七、命名实体识别
7.1 基于规则的NER
package main

import (
	"fmt"
	"regexp"
)

// Entity is one match found by extractEntities. Start and End are byte
// offsets into the scanned text (End is exclusive), so text[Start:End] == Text.
type Entity struct {
	Text  string
	Type  string
	Start int
	End   int
}

// entityPatterns lists the recognised entity types in the order in which
// their matches are reported. The expressions are compiled once at package
// scope instead of on every extractEntities call.
var entityPatterns = []struct {
	kind string
	re   *regexp.Regexp
}{
	// E-mail addresses.
	{"EMAIL", regexp.MustCompile(`[\w.-]+@[\w.-]+\.\w+`)},
	// Phone numbers: 3-4 digits followed by two groups of 4, optionally
	// separated by "-" or ".". The word boundaries stop the pattern from
	// matching a slice of a longer digit run.
	{"PHONE", regexp.MustCompile(`\b\d{3,4}[-.]?\d{4}[-.]?\d{4}\b`)},
	// http and https URLs.
	{"URL", regexp.MustCompile(`https?://[\w.-]+(?:/[\w./-]*)?`)},
}

// extractEntities scans text for e-mail addresses, phone numbers and URLs.
// Matches are grouped by type (EMAIL, then PHONE, then URL) and, within a
// type, appear in text order. It returns nil when nothing matches.
func extractEntities(text string) []Entity {
	var entities []Entity

	for _, p := range entityPatterns {
		for _, match := range p.re.FindAllStringIndex(text, -1) {
			entities = append(entities, Entity{
				Text:  text[match[0]:match[1]],
				Type:  p.kind,
				Start: match[0],
				End:   match[1],
			})
		}
	}

	return entities
}

// main extracts the entities from a sample contact text and prints them.
func main() {
	text := `联系我们: support@example.com 或拨打 123-4567-8900 更多信息请访问 https://www.example.com/products`

	entities := extractEntities(text)

	fmt.Println("提取的实体:")
	for _, entity := range entities {
		fmt.Printf("类型: %s, 文本: %s, 位置: [%d-%d]\n",
			entity.Type, entity.Text, entity.Start, entity.End)
	}
}

// Next section: 八、文本相似度
8.1 余弦相似度
package main

import (
	"fmt"
	"math"
	"strings"
	"unicode"
)

// tokenize converts text into a bag-of-words vector: a map from lower-cased
// word to its number of occurrences. Punctuation is trimmed from both ends of
// every whitespace-separated token so that "Go." and "go" are the same word;
// tokens consisting only of punctuation are dropped.
func tokenize(text string) map[string]int {
	freq := make(map[string]int)
	for _, field := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(field, unicode.IsPunct)
		if word == "" {
			continue
		}
		freq[word]++
	}
	return freq
}

// dotProduct returns the dot product of two sparse word-count vectors. Words
// missing from v2 contribute zero.
func dotProduct(v1, v2 map[string]int) int {
	sum := 0
	for word, count := range v1 {
		sum += count * v2[word]
	}
	return sum
}

// magnitude returns the Euclidean norm of a word-count vector.
func magnitude(v map[string]int) float64 {
	sum := 0
	for _, count := range v {
		sum += count * count
	}
	return math.Sqrt(float64(sum))
}

// cosineSimilarity returns the cosine of the angle between the bag-of-words
// vectors of text1 and text2. The result lies in [0, 1]; it is 0 when either
// text contains no words.
func cosineSimilarity(text1, text2 string) float64 {
	v1 := tokenize(text1)
	v2 := tokenize(text2)

	mag1 := magnitude(v1)
	mag2 := magnitude(v2)
	if mag1 == 0 || mag2 == 0 {
		return 0
	}

	return float64(dotProduct(v1, v2)) / (mag1 * mag2)
}

// main compares one sentence with a related and an unrelated sentence.
func main() {
	text1 := "I love programming in Go"
	text2 := "Go is a great programming language"
	text3 := "Cats are cute animals"

	similarity12 := cosineSimilarity(text1, text2)
	similarity13 := cosineSimilarity(text1, text3)

	fmt.Printf("文本1与文本2相似度: %.4f\n", similarity12)
	fmt.Printf("文本1与文本3相似度: %.4f\n", similarity13)
}

// Next section: 九、实战:文本搜索引擎
package main

import (
	"fmt"
	"strings"
)

// Document is a searchable text with a title and a body. The engine does not
// read or assign ID; it is carried through unchanged for the caller.
type Document struct {
	ID    int
	Title string
	Body  string
}

// SearchEngine is an in-memory inverted index over documents. Create
// instances with NewSearchEngine.
type SearchEngine struct {
	documents []Document
	index     map[string][]int // word -> ascending positions in documents
}

// NewSearchEngine returns an empty search engine.
func NewSearchEngine() *SearchEngine {
	return &SearchEngine{
		documents: make([]Document, 0),
		index:     make(map[string][]int),
	}
}

// AddDocument stores doc and indexes every distinct lower-cased,
// whitespace-separated word of its title and body.
func (se *SearchEngine) AddDocument(doc Document) {
	docID := len(se.documents)
	se.documents = append(se.documents, doc)

	// Each word is indexed once per document, which keeps every posting list
	// free of duplicates and, since positions only grow, sorted.
	seen := make(map[string]struct{})
	for _, word := range strings.Fields(strings.ToLower(doc.Title + " " + doc.Body)) {
		if _, dup := seen[word]; dup {
			continue
		}
		seen[word] = struct{}{}
		se.index[word] = append(se.index[word], docID)
	}
}

// Search returns the documents that contain every word of query (AND
// semantics, case-insensitive), in insertion order. It returns an empty,
// non-nil slice when the query is empty or any query word is not indexed.
func (se *SearchEngine) Search(query string) []Document {
	var resultIDs []int
	for i, word := range strings.Fields(strings.ToLower(query)) {
		docIDs, ok := se.index[word]
		if !ok {
			// One missing word makes the whole conjunction empty.
			return []Document{}
		}
		if i == 0 {
			resultIDs = docIDs
			continue
		}
		resultIDs = intersect(resultIDs, docIDs)
	}

	results := make([]Document, 0, len(resultIDs))
	for _, id := range resultIDs {
		results = append(results, se.documents[id])
	}
	return results
}

// intersect returns the values present in both a and b. Both slices must be
// sorted in ascending order; the result is sorted and never nil.
func intersect(a, b []int) []int {
	result := make([]int, 0)
	for i, j := 0, 0; i < len(a) && j < len(b); {
		switch {
		case a[i] == b[j]:
			result = append(result, a[i])
			i++
			j++
		case a[i] < b[j]:
			i++
		default:
			j++
		}
	}
	return result
}

// main indexes three documents and prints the matches for "Go programming".
func main() {
	se := NewSearchEngine()

	se.AddDocument(Document{Title: "Go Programming", Body: "Go is a programming language created by Google"})
	se.AddDocument(Document{Title: "Machine Learning", Body: "Machine learning is a subset of AI"})
	se.AddDocument(Document{Title: "Go and AI", Body: "Go can be used for AI and machine learning"})

	results := se.Search("Go programming")

	fmt.Println("搜索结果:")
	for _, doc := range results {
		fmt.Printf("标题: %s\n内容: %s\n\n", doc.Title, doc.Body)
	}
}

// Next section: 十、总结
本文介绍了如何使用Go语言进行自然语言处理,包括:
- 文本处理基础:字符串操作、Unicode处理
- 分词处理:英文分词和中文分词
- 词频统计:基础词频和TF-IDF计算
- 文本分类:朴素贝叶斯分类器
- 文本生成:马尔可夫链文本生成
- 情感分析:基于词典的情感分析
- 命名实体识别:基于规则的NER
- 文本相似度:余弦相似度计算
- 实战项目:简单文本搜索引擎
通过这些实现,你可以使用Go语言构建各种NLP应用,充分利用Go的性能优势处理大规模文本数据。
