当前位置: 首页 > news > 正文

Go语言自然语言处理:文本处理与分析

Go语言自然语言处理:文本处理与分析

引言

自然语言处理(NLP)是人工智能的重要分支,它使计算机能够理解、处理和生成人类语言。Go语言以其高性能和并发能力,成为构建NLP应用的理想选择。本文将介绍如何使用Go语言进行自然语言处理。

一、文本处理基础

1.1 字符串操作

package main

import (
	"fmt"
	"strings"
	"unicode/utf8"
)

// main demonstrates basic string operations from the standard library on a
// sample string that mixes ASCII and Chinese text.
func main() {
	text := "Hello, World! 你好,世界!"

	// len reports the number of bytes, not characters; multi-byte UTF-8
	// characters make the two differ, so the rune count is printed as well.
	fmt.Printf("长度: %d\n", len(text))
	fmt.Printf("字符数: %d\n", utf8.RuneCountInString(text))

	// Case conversion.
	fmt.Printf("小写: %s\n", strings.ToLower(text))
	fmt.Printf("大写: %s\n", strings.ToUpper(text))

	// Split on runs of whitespace.
	words := strings.Fields(text)
	fmt.Printf("分词: %v\n", words)

	// Replace every occurrence of a substring.
	replaced := strings.ReplaceAll(text, "World", "Go")
	fmt.Printf("替换后: %s\n", replaced)
}

1.2 Unicode处理

package main

import (
	"fmt"
	"unicode"
)

// main walks the sample text rune by rune and prints the category of each
// rune: letter, digit, whitespace, or other.
func main() {
	sample := "Hello 世界 123 !"

	for _, ch := range sample {
		// Checks run in the same priority order: letter, digit, space.
		kind := "其他"
		if unicode.IsLetter(ch) {
			kind = "字母"
		} else if unicode.IsDigit(ch) {
			kind = "数字"
		} else if unicode.IsSpace(ch) {
			kind = "空格"
		}
		fmt.Printf("%c - 类型: %s\n", ch, kind)
	}
}

二、分词处理

2.1 英文分词

package main

import (
	"fmt"
	"regexp"
	"strings"
)

// punctuation matches any rune that is not a Unicode letter, a Unicode
// digit, an underscore, or ASCII whitespace. It is compiled once at package
// scope instead of on every tokenize call. The Unicode classes are used
// because \w only covers ASCII in Go's regexp syntax and would otherwise
// delete accented letters.
var punctuation = regexp.MustCompile(`[^\p{L}\p{N}_\s]`)

// tokenize removes punctuation from text, lowercases it, and splits it on
// whitespace. Punctuation is deleted rather than replaced by a space, so
// "don't" becomes the single token "dont".
func tokenize(text string) []string {
	cleaned := punctuation.ReplaceAllString(text, "")
	return strings.Fields(strings.ToLower(cleaned))
}

func main() {
	text := "Hello, World! This is a test sentence."
	tokens := tokenize(text)
	fmt.Printf("分词结果: %v\n", tokens)
}

2.2 中文分词

go get github.com/go-ego/gse
package main import ( "fmt" "github.com/go-ego/gse" ) func main() { seg := gse.New() // 加载词典 err := seg.LoadDict("zh") if err != nil { panic(err) } text := "我爱北京天安门" words := seg.Cut(text, true) fmt.Printf("中文分词结果: %v\n", words) }

三、词频统计

3.1 基础词频统计

package main

import (
	"fmt"
	"sort"
	"strings"
	"unicode"
)

// isNotWordRune reports whether r is neither a letter nor a digit. It is
// used to trim punctuation from the edges of a token.
func isNotWordRune(r rune) bool {
	return !unicode.IsLetter(r) && !unicode.IsDigit(r)
}

// wordFrequency counts how often each word occurs in text. Words are
// lowercased and stripped of leading and trailing punctuation, so "Go!",
// "go." and "go" are counted as the same word. Tokens that consist only of
// punctuation are ignored.
func wordFrequency(text string) map[string]int {
	freq := make(map[string]int)
	for _, field := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(field, isNotWordRune)
		if word == "" {
			continue
		}
		freq[word]++
	}
	return freq
}

// sortByFrequency returns the words of freq ordered by descending count.
// Words with equal counts are ordered alphabetically so that the result
// does not depend on map iteration order.
func sortByFrequency(freq map[string]int) []string {
	words := make([]string, 0, len(freq))
	for word := range freq {
		words = append(words, word)
	}
	sort.Slice(words, func(i, j int) bool {
		ci, cj := freq[words[i]], freq[words[j]]
		if ci != cj {
			return ci > cj
		}
		return words[i] < words[j]
	})
	return words
}

func main() {
	text := "Hello world! Hello Go! Go is great. Go is fun."
	freq := wordFrequency(text)
	sortedWords := sortByFrequency(freq)

	fmt.Println("词频统计:")
	for _, word := range sortedWords {
		fmt.Printf("%s: %d\n", word, freq[word])
	}
}

3.2 TF-IDF计算

package main

import (
	"fmt"
	"math"
)

// computeTF returns the term frequency of term in doc: the number of
// occurrences divided by the document length. An empty document yields 0
// instead of the NaN that 0/0 would produce.
func computeTF(term string, doc []string) float64 {
	if len(doc) == 0 {
		return 0
	}
	count := 0
	for _, word := range doc {
		if word == term {
			count++
		}
	}
	return float64(count) / float64(len(doc))
}

// computeIDF returns the smoothed inverse document frequency of term over
// docs, ln((1+N)/(1+df)) + 1, where N is the number of documents and df the
// number of documents containing term. This form is never negative and is
// never zero, so a term that occurs in most or all documents still gets a
// small positive weight.
func computeIDF(term string, docs [][]string) float64 {
	docCount := 0
	for _, doc := range docs {
		for _, word := range doc {
			if word == term {
				docCount++
				break // count each document at most once
			}
		}
	}
	return math.Log(float64(1+len(docs))/float64(1+docCount)) + 1
}

// computeTFIDF returns the TF-IDF weight of term in doc relative to the
// corpus docs.
func computeTFIDF(term string, doc []string, docs [][]string) float64 {
	return computeTF(term, doc) * computeIDF(term, docs)
}

func main() {
	docs := [][]string{
		{"hello", "world", "go"},
		{"hello", "go", "lang"},
		{"world", "programming"},
	}

	term := "go"
	doc := docs[0]
	tfidf := computeTFIDF(term, doc, docs)
	fmt.Printf("TF-IDF for '%s': %.4f\n", term, tfidf)
}

四、文本分类

4.1 朴素贝叶斯分类器

package main

import (
	"fmt"
	"math"
	"sort"
	"strings"
	"unicode"
)

// NaiveBayesClassifier is a multinomial naive Bayes text classifier with
// add-one (Laplace) smoothing.
type NaiveBayesClassifier struct {
	classCounts     map[string]int            // documents seen per class
	wordCounts      map[string]map[string]int // class -> word -> occurrences
	classWordTotals map[string]int            // total word occurrences per class
	vocabulary      map[string]struct{}       // every distinct word seen in training
	totalDocuments  int
}

// NewNaiveBayesClassifier returns an empty classifier ready for training.
func NewNaiveBayesClassifier() *NaiveBayesClassifier {
	return &NaiveBayesClassifier{
		classCounts:     make(map[string]int),
		wordCounts:      make(map[string]map[string]int),
		classWordTotals: make(map[string]int),
		vocabulary:      make(map[string]struct{}),
	}
}

// nbTokens lowercases text, splits it on whitespace, and trims punctuation
// from the edges of each token so that "film," and "film" are the same word.
func nbTokens(text string) []string {
	fields := strings.Fields(strings.ToLower(text))
	tokens := make([]string, 0, len(fields))
	for _, field := range fields {
		word := strings.TrimFunc(field, func(r rune) bool {
			return !unicode.IsLetter(r) && !unicode.IsDigit(r)
		})
		if word != "" {
			tokens = append(tokens, word)
		}
	}
	return tokens
}

// Train updates the model with docs and their labels; labels[i] is the
// class of docs[i]. If the slices differ in length, only the leading pairs
// present in both are used, rather than indexing out of range.
func (nb *NaiveBayesClassifier) Train(docs []string, labels []string) {
	n := len(docs)
	if len(labels) < n {
		n = len(labels)
	}
	for i := 0; i < n; i++ {
		label := labels[i]
		nb.classCounts[label]++
		nb.totalDocuments++

		counts, ok := nb.wordCounts[label]
		if !ok {
			counts = make(map[string]int)
			nb.wordCounts[label] = counts
		}
		for _, word := range nbTokens(docs[i]) {
			counts[word]++
			nb.classWordTotals[label]++
			nb.vocabulary[word] = struct{}{}
		}
	}
}

// Predict returns the most probable class for doc, or "" if the classifier
// has not been trained. Each word contributes
// log((count(word, class) + 1) / (words in class + vocabulary size)).
// Classes are visited in sorted order, so ties go to the alphabetically
// first class instead of depending on map iteration order.
func (nb *NaiveBayesClassifier) Predict(doc string) string {
	if nb.totalDocuments == 0 {
		return ""
	}
	words := nbTokens(doc)

	classes := make([]string, 0, len(nb.classCounts))
	for class := range nb.classCounts {
		classes = append(classes, class)
	}
	sort.Strings(classes)

	vocabSize := len(nb.vocabulary)
	bestClass := ""
	bestScore := math.Inf(-1)
	for _, class := range classes {
		score := math.Log(float64(nb.classCounts[class]) / float64(nb.totalDocuments))

		denom := float64(nb.classWordTotals[class] + vocabSize)
		if denom == 0 {
			// Only empty documents were seen; avoid dividing by zero.
			denom = 1
		}
		for _, word := range words {
			score += math.Log(float64(nb.wordCounts[class][word]+1) / denom)
		}

		if score > bestScore {
			bestScore = score
			bestClass = class
		}
	}
	return bestClass
}

func main() {
	nb := NewNaiveBayesClassifier()

	docs := []string{
		"I love this movie",
		"Great film, highly recommend",
		"Terrible movie, waste of time",
		"Hated every minute of it",
		"Excellent performance",
		"Poor acting, bad script",
	}
	labels := []string{"positive", "positive", "negative", "negative", "positive", "negative"}

	nb.Train(docs, labels)

	testDoc := "This movie was amazing"
	prediction := nb.Predict(testDoc)
	fmt.Printf("预测结果: %s\n", prediction)
}

五、文本生成

5.1 马尔可夫链文本生成

package main

import (
	"fmt"
	"math/rand"
	"strings"
	"time"
)

// MarkovChain is a word-level Markov chain text generator. Each state is a
// sequence of `order` consecutive words joined by single spaces.
type MarkovChain struct {
	transitions map[string][]string // state -> words observed after it
	order       int
	rng         *rand.Rand // per-chain generator; the global one is left untouched
}

// NewMarkovChain returns an empty chain whose states span order words.
// An order below 1 is raised to 1, because a non-positive order cannot
// form a state. The chain owns its random source instead of reseeding the
// package-level generator through the deprecated rand.Seed.
func NewMarkovChain(order int) *MarkovChain {
	if order < 1 {
		order = 1
	}
	return &MarkovChain{
		transitions: make(map[string][]string),
		order:       order,
		rng:         rand.New(rand.NewSource(time.Now().UnixNano())),
	}
}

// Train records, for every window of order consecutive words in text, the
// word that follows it. Text is lowercased and split on whitespace; text
// with order words or fewer adds nothing.
func (mc *MarkovChain) Train(text string) {
	words := strings.Fields(strings.ToLower(text))
	for i := 0; i+mc.order < len(words); i++ {
		key := strings.Join(words[i:i+mc.order], " ")
		mc.transitions[key] = append(mc.transitions[key], words[i+mc.order])
	}
}

// Generate returns at most length words of generated text. It starts from
// a randomly chosen trained state and stops early when the current state
// has no recorded successor. It returns "" for an untrained chain or a
// non-positive length.
func (mc *MarkovChain) Generate(length int) string {
	if length <= 0 || len(mc.transitions) == 0 {
		return ""
	}

	// Pick a random starting state.
	keys := make([]string, 0, len(mc.transitions))
	for key := range mc.transitions {
		keys = append(keys, key)
	}
	current := keys[mc.rng.Intn(len(keys))]
	result := strings.Split(current, " ")

	for len(result) < length {
		nextWords := mc.transitions[current]
		if len(nextWords) == 0 {
			break
		}
		result = append(result, nextWords[mc.rng.Intn(len(nextWords))])
		// The new state is the last `order` words produced so far.
		current = strings.Join(result[len(result)-mc.order:], " ")
	}

	// The starting state alone may already exceed a very small length.
	if len(result) > length {
		result = result[:length]
	}
	return strings.Join(result, " ")
}

func main() {
	text := `I love Go programming. Go is a great language. Go is fast and efficient. I love programming in Go.`

	mc := NewMarkovChain(2)
	mc.Train(text)

	generated := mc.Generate(10)
	fmt.Printf("生成文本: %s\n", generated)
}

六、情感分析

6.1 简单情感分析

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// Sentiment lexicons: words counted as positive or negative evidence.
var (
	positiveWords = map[string]bool{
		"love": true, "great": true, "excellent": true, "amazing": true,
		"good": true, "best": true, "wonderful": true, "fantastic": true,
	}

	negativeWords = map[string]bool{
		"hate": true, "terrible": true, "bad": true, "awful": true,
		"worst": true, "poor": true, "horrible": true,
	}
)

// analyzeSentiment scores text in the range [-1, 1] as
// (positive - negative) / (positive + negative), counting lexicon hits.
// It returns 0 when no lexicon word occurs. Words are lowercased and
// stripped of leading and trailing punctuation before lookup, so "bad." and
// "amazing!" are recognised. Negation ("not great") is not modelled.
func analyzeSentiment(text string) float64 {
	positiveCount := 0
	negativeCount := 0

	for _, field := range strings.Fields(strings.ToLower(text)) {
		word := strings.TrimFunc(field, func(r rune) bool {
			return !unicode.IsLetter(r) && !unicode.IsDigit(r)
		})
		if positiveWords[word] {
			positiveCount++
		}
		if negativeWords[word] {
			negativeCount++
		}
	}

	total := positiveCount + negativeCount
	if total == 0 {
		return 0
	}
	return float64(positiveCount-negativeCount) / float64(total)
}

func main() {
	texts := []string{
		"I love this movie, it's amazing!",
		"Terrible experience, hated it.",
		"It was okay, not great but not bad.",
	}

	for _, text := range texts {
		score := analyzeSentiment(text)

		// Scores within [-0.3, 0.3] are reported as neutral.
		sentiment := "中性"
		switch {
		case score > 0.3:
			sentiment = "正面"
		case score < -0.3:
			sentiment = "负面"
		}

		fmt.Printf("文本: %s\n情感得分: %.2f (%s)\n\n", text, score, sentiment)
	}
}

七、命名实体识别

7.1 基于规则的NER

package main

import (
	"fmt"
	"regexp"
)

// Entity is one span of text recognised as a named entity. Start and End
// are byte offsets into the scanned text; End is exclusive.
type Entity struct {
	Text  string
	Type  string
	Start int
	End   int
}

// entityPatterns lists the recognisers in the order their matches are
// reported. The expressions are compiled once at package scope rather than
// on every extractEntities call.
var entityPatterns = []struct {
	kind string
	re   *regexp.Regexp
}{
	{"EMAIL", regexp.MustCompile(`[\w.-]+@[\w.-]+\.\w+`)},
	{"PHONE", regexp.MustCompile(`\d{3,4}[-.]?\d{4}[-.]?\d{4}`)},
	{"URL", regexp.MustCompile(`https?://[\w.-]+(?:/[\w./-]*)?`)},
}

// extractEntities returns every e-mail address, phone number and URL found
// in text. Results are grouped by type (all e-mails, then phones, then
// URLs) and are in order of appearance within each group. It returns nil
// when nothing matches.
func extractEntities(text string) []Entity {
	var entities []Entity
	for _, p := range entityPatterns {
		for _, match := range p.re.FindAllStringIndex(text, -1) {
			entities = append(entities, Entity{
				Text:  text[match[0]:match[1]],
				Type:  p.kind,
				Start: match[0],
				End:   match[1],
			})
		}
	}
	return entities
}

func main() {
	text := `联系我们: support@example.com 或拨打 123-4567-8900 更多信息请访问 https://www.example.com/products`

	entities := extractEntities(text)

	fmt.Println("提取的实体:")
	for _, entity := range entities {
		fmt.Printf("类型: %s, 文本: %s, 位置: [%d-%d]\n",
			entity.Type, entity.Text, entity.Start, entity.End)
	}
}

八、文本相似度

8.1 余弦相似度

package main

import (
	"fmt"
	"math"
	"strings"
)

// tokenize turns text into a bag-of-words vector: it lowercases the text,
// splits it on whitespace, and maps each token to its occurrence count.
func tokenize(text string) map[string]int {
	counts := map[string]int{}
	lowered := strings.ToLower(text)
	for _, token := range strings.Fields(lowered) {
		counts[token] = counts[token] + 1
	}
	return counts
}

// dotProduct returns the inner product of two sparse count vectors. Terms
// missing from v2 contribute zero.
func dotProduct(v1, v2 map[string]int) int {
	total := 0
	for term := range v1 {
		total += v1[term] * v2[term]
	}
	return total
}

// magnitude returns the Euclidean length of a sparse count vector.
func magnitude(v map[string]int) float64 {
	squares := 0
	for term := range v {
		squares += v[term] * v[term]
	}
	return math.Sqrt(float64(squares))
}

// cosineSimilarity returns the cosine of the angle between the word-count
// vectors of text1 and text2, or 0 if either text contains no tokens.
func cosineSimilarity(text1, text2 string) float64 {
	left, right := tokenize(text1), tokenize(text2)

	normLeft, normRight := magnitude(left), magnitude(right)
	if normLeft == 0 || normRight == 0 {
		return 0
	}

	return float64(dotProduct(left, right)) / (normLeft * normRight)
}

func main() {
	text1 := "I love programming in Go"
	text2 := "Go is a great programming language"
	text3 := "Cats are cute animals"

	fmt.Printf("文本1与文本2相似度: %.4f\n", cosineSimilarity(text1, text2))
	fmt.Printf("文本1与文本3相似度: %.4f\n", cosineSimilarity(text1, text3))
}

九、实战:文本搜索引擎

package main

import (
	"fmt"
	"strings"
	"unicode"
)

// Document is a unit of searchable text.
type Document struct {
	ID    int
	Title string
	Body  string
}

// SearchEngine is an in-memory inverted index over documents. The index
// maps each term to the ascending positions, within documents, of the
// documents that contain it.
type SearchEngine struct {
	documents []Document
	index     map[string][]int
}

// NewSearchEngine returns an empty search engine.
func NewSearchEngine() *SearchEngine {
	return &SearchEngine{
		documents: make([]Document, 0),
		index:     make(map[string][]int),
	}
}

// searchTerms lowercases text, splits it on whitespace, and trims
// punctuation from the edges of each token, so that "Google." and "google"
// index and match as the same term. Tokens left empty are dropped.
func searchTerms(text string) []string {
	fields := strings.Fields(strings.ToLower(text))
	terms := make([]string, 0, len(fields))
	for _, field := range fields {
		term := strings.TrimFunc(field, func(r rune) bool {
			return !unicode.IsLetter(r) && !unicode.IsDigit(r)
		})
		if term != "" {
			terms = append(terms, term)
		}
	}
	return terms
}

// AddDocument stores doc and indexes every distinct term of its title and
// body. Documents are identified internally by insertion position; the
// Document.ID field is stored as given and is not used by the index.
func (se *SearchEngine) AddDocument(doc Document) {
	se.documents = append(se.documents, doc)
	docID := len(se.documents) - 1

	seen := make(map[string]struct{})
	for _, term := range searchTerms(doc.Title + " " + doc.Body) {
		if _, dup := seen[term]; dup {
			continue // list each document once per term
		}
		seen[term] = struct{}{}
		se.index[term] = append(se.index[term], docID)
	}
}

// Search returns the documents that contain every term of query, in
// insertion order. It returns an empty slice when the query has no terms
// or when any term is absent from the index.
func (se *SearchEngine) Search(query string) []Document {
	var resultIDs []int
	for i, term := range searchTerms(query) {
		docIDs, ok := se.index[term]
		if !ok {
			return []Document{}
		}
		if i == 0 {
			resultIDs = docIDs
			continue
		}
		resultIDs = intersect(resultIDs, docIDs)
	}

	results := make([]Document, 0, len(resultIDs))
	for _, id := range resultIDs {
		results = append(results, se.documents[id])
	}
	return results
}

// intersect returns the values common to a and b. Both inputs must be
// sorted in ascending order; the result is sorted as well.
func intersect(a, b []int) []int {
	result := make([]int, 0)
	i, j := 0, 0
	for i < len(a) && j < len(b) {
		switch {
		case a[i] == b[j]:
			result = append(result, a[i])
			i++
			j++
		case a[i] < b[j]:
			i++
		default:
			j++
		}
	}
	return result
}

func main() {
	se := NewSearchEngine()

	se.AddDocument(Document{Title: "Go Programming", Body: "Go is a programming language created by Google"})
	se.AddDocument(Document{Title: "Machine Learning", Body: "Machine learning is a subset of AI"})
	se.AddDocument(Document{Title: "Go and AI", Body: "Go can be used for AI and machine learning"})

	results := se.Search("Go programming")

	fmt.Println("搜索结果:")
	for _, doc := range results {
		fmt.Printf("标题: %s\n内容: %s\n\n", doc.Title, doc.Body)
	}
}

十、总结

本文介绍了如何使用Go语言进行自然语言处理,包括:

  1. 文本处理基础:字符串操作、Unicode处理
  2. 分词处理:英文分词和中文分词
  3. 词频统计:基础词频和TF-IDF计算
  4. 文本分类:朴素贝叶斯分类器
  5. 文本生成:马尔可夫链文本生成
  6. 情感分析:基于词典的情感分析
  7. 命名实体识别:基于规则的NER
  8. 文本相似度:余弦相似度计算
  9. 实战项目:简单文本搜索引擎

通过这些实现,你可以使用Go语言构建各种NLP应用,充分利用Go的性能优势处理大规模文本数据。

http://www.rkmt.cn/news/1424520.html

相关文章:

  • STM32F407标准库实战:串口+DMA收发数据,如何设计一个高效的环形缓冲区管理模块?
  • 你想何出怎样的SRAM CIM
  • 量子视觉场技术:量子计算与计算机视觉的融合创新
  • Python 函数完全指南:定义与调用
  • 网页切图工具,网格切图,非常方便
  • 两个独立事件的联合概率
  • 2026年北京老家具回收机构排行 靠谱之选盘点 - 优质品牌商家
  • 千问大模型在阿里生态中的实战应用指南
  • 收藏!Python小白必看:从零入门大模型,手把手带你掌握企业级实战能力
  • 专访 7 名普通职场人:AI 来了之后,你过得还好吗?
  • 告别风扇噪音与高温:FanControl三分钟搞定Windows散热优化
  • 别再死记硬背Sarsa公式了!用Python手搓一个走迷宫AI,5分钟搞懂On-Policy和Q-learning的区别
  • 工业防爆监控技术解析与山东区域选型实践
  • Windows开始菜单修复终极指南:三步恢复消失的磁贴
  • Codex 新增“宠物”功能:不只是可爱,而是一个轻量工作状态提醒器
  • 工具使用、代理和 Voyager 论文
  • 别再被多重共线性坑了!用Python的sklearn手把手教你调岭回归的alpha参数
  • 2026年嵌丝道口板TOP5厂商盘点 品质与实力对比 - 优质品牌商家
  • 93、CAN FD数据链路层核心:帧结构对比与DLC编码革命
  • 172 号卡哪个推荐码是官方一级?10000 置顶权限真实解析 - 172号卡
  • Lindy自动化项目管理:从概念验证到规模化落地的7个关键决策节点(附20年踩坑清单)
  • 2026年5月更新:浙江老爹鞋制造商业内推荐与趋势解析 - 2026年企业资讯
  • Harness 中的请求影子复制:用于离线分析
  • 我的Obsidian知识库,现在可以自动剪藏笔记到本地了
  • 【从零开始的JUC并发第四章】:JUC常用工具类
  • 新手也能跑通大模型,Hugging Face 环境配置与模型加载指南
  • 5分钟掌握VideoDownloadHelper:你的网页视频下载救星
  • 告别LPC!手把手教你用ESPI协议连接PCH与EC(含信号实测图与模式选择指南)
  • 告别格式返工!okbiye 论文智能排版,一键对齐千校规范,毕业季效率拉满
  • GPU内存稳定性实战指南:深入解析MemtestCL系统教程