1. Steps of Word2Vec word-clustering-based keyword extraction
Keyword extraction based on Word2Vec word clustering involves the following steps:
1. Prepare the text data: collect or prepare the texts, either a single document or a document collection, covering the domain in which keywords are to be extracted.
2. Preprocess the text: clean the data (remove irrelevant characters and punctuation, lower-case where appropriate, etc.) and segment it into words.
3. Train a Word2Vec model: train Word2Vec on the preprocessed text, either with an existing library such as gensim or with your own implementation, and set the model parameters such as vector dimensionality, window size, and minimum word frequency (see the training sketch below).
4. Get the word vectors: use the trained Word2Vec model to obtain a vector for each word.
5. Cluster the words: apply a clustering algorithm to the word vectors so that similar words end up in the same cluster; common choices include K-Means and DBSCAN. Define the number of clusters.
6. Pick representative words for each cluster: for every cluster, select representative words as keywords, for example by using the cluster center or another representativeness measure.
7. Output the keywords: output the representative words of every cluster to obtain the final keyword list.
The core of the whole pipeline is to obtain word vectors from the Word2Vec model, group similar words into clusters with a clustering algorithm, and finally take the representative words of each cluster as the keywords. Because the word vectors capture semantic relations between words, the extracted keywords are more expressive.
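Step 3 (training the Word2Vec model) is not covered by the implementation below, which loads pre-trained wiki.zh.text.vector vectors instead. As a minimal sketch, assuming gensim 4.x and a hypothetical pre-segmented corpus file data/corpus_seg.txt (one whitespace-separated document per line; file name and parameter values are illustrative, not from the original), training could look like this:

# Minimal training sketch (assumptions: gensim >= 4.0, segmented corpus at data/corpus_seg.txt)
from gensim.models import Word2Vec
from gensim.models.word2vec import LineSentence

sentences = LineSentence('data/corpus_seg.txt')  # iterate over the segmented corpus
model = Word2Vec(
    sentences,
    vector_size=400,  # word-vector dimensionality (matches the 400-dim CSVs written below)
    window=5,         # context window size
    min_count=5,      # minimum word frequency
    workers=4,        # parallel training threads
)
# Save in the text format expected by KeyedVectors.load_word2vec_format(..., binary=False)
model.wv.save_word2vec_format('wiki.zh.text.vector', binary=False)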
2. Code implementation of Word2Vec word-clustering-based keyword extraction
Preprocessing: building the word vectors
# coding=utf-8
import warnings
warnings.filterwarnings(action='ignore', category=UserWarning, module='gensim')  # suppress gensim warnings
import codecs
import pandas as pd
import numpy as np
import jieba  # word segmentation
import jieba.posseg
import gensim  # for loading the word-vector model
# Return the feature word vectors as a DataFrame (word column + vector columns)
def word_vecs(wordList, model):
    name = []
    vecs = []
    for word in wordList:
        word = word.replace('\n', '')
        try:
            if word in model:  # the model has a vector for this word
                name.append(word)
                vecs.append(model[word])
        except KeyError:
            continue
    a = pd.DataFrame(name, columns=['word'])
    b = pd.DataFrame(np.array(vecs, dtype='float'))
    return pd.concat([a, b], axis=1)
# Data preprocessing: word segmentation, stop-word removal, POS filtering
def data_prepare(text, stopkey):
    l = []
    # POS tags to keep
    pos = ['n', 'nz', 'v', 'vd', 'vn', 'l', 'a', 'd']
    seg = jieba.posseg.cut(text)  # segmentation with POS tagging
    for i in seg:
        # deduplicate + drop stop words + keep only the selected POS tags
        if i.word not in l and i.word not in stopkey and i.flag in pos:
            l.append(i.word)
    return l
# Build the candidate-keyword word vectors for every document in the data set
def build_words_vecs(data, stopkey, model):
    idList, titleList, abstractList = data['id'], data['title'], data['abstract']
    for index in range(len(idList)):
        id = idList[index]
        title = titleList[index]
        abstract = abstractList[index]
        l_ti = data_prepare(title, stopkey)     # process the title
        l_ab = data_prepare(abstract, stopkey)  # process the abstract
        # candidate keywords = unique words from title + abstract
        words = np.append(l_ti, l_ab)   # concatenate the two word lists
        words = list(set(words))        # deduplicate to get the candidate keyword list
        wordvecs = word_vecs(words, model)  # word-vector representation of the candidates
        # write the vectors to a CSV file, one 400-dimensional vector per word
        data_vecs = pd.DataFrame(wordvecs)
        data_vecs.to_csv('result/vecs/wordvecs_' + str(id) + '.csv', index=False)
        print("document ", id, " well done.")
def main():
    # read the data set
    dataFile = 'data/text.csv'
    data = pd.read_csv(dataFile)
    # stop-word list
    stopkey = [w.strip() for w in codecs.open('data/stopWord.txt', 'r', encoding='utf-8').readlines()]
    # pre-trained word-vector model (word2vec text format)
    inp = 'wiki.zh.text.vector'
    model = gensim.models.KeyedVectors.load_word2vec_format(inp, binary=False)
    build_words_vecs(data, stopkey, model)

if __name__ == '__main__':
    main()
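The script above assumes an input file data/text.csv with id, title and abstract columns, a stop-word list at data/stopWord.txt, and an existing result/vecs/ directory. A rough sketch of preparing such an input (the two documents below are made-up placeholders, not data from the original article):

# Illustrative only: build a tiny data/text.csv with the columns the script expects.
import os
import pandas as pd

os.makedirs('result/vecs', exist_ok=True)  # the script writes one word-vector CSV per document here
sample = pd.DataFrame({
    'id': [1, 2],
    'title': ['示例標題一', '示例標題二'],                                  # placeholder titles
    'abstract': ['這是第一篇文檔的摘要。', '這是第二篇文檔的摘要。'],        # placeholder abstracts
})
sample.to_csv('data/text.csv', index=False, encoding='utf_8_sig')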
Keyword extraction based on word2vec
# coding=utf-8
import os
from sklearn.cluster import KMeans
import pandas as pd
import numpy as np
import math
# Use K-means clustering on the word vectors to extract the top-K keywords
def words_kmeans(data, topK):
    words = data["word"]     # the candidate words
    vecs = data.iloc[:, 1:]  # their vector representations
    kmeans = KMeans(n_clusters=1, random_state=10).fit(vecs)
    labels = kmeans.labels_  # cluster label of every word
    labels = pd.DataFrame(labels, columns=['label'])
    new_df = pd.concat([labels, vecs], axis=1)
    vec_center = kmeans.cluster_centers_  # cluster centers
    # compute similarity as the Euclidean distance to the cluster center
    distances = []
    vec_words = np.array(vecs)  # candidate word vectors, DataFrame -> array
    vec_center = vec_center[0]  # center of the first (and only) cluster in this example
    length = len(vec_center)    # vector dimensionality
    for index in range(len(vec_words)):   # iterate over the candidate words
        cur_wordvec = vec_words[index]    # vector of the current word
        dis = 0                           # squared-distance accumulator
        for index2 in range(length):
            dis += (vec_center[index2] - cur_wordvec[index2]) * \
                   (vec_center[index2] - cur_wordvec[index2])
        dis = math.sqrt(dis)
        distances.append(dis)
    distances = pd.DataFrame(distances, columns=['dis'])
    # join each word with its distance to the center
    result = pd.concat([words, labels, distances], axis=1)
    # sort by distance in ascending order
    result = result.sort_values(by="dis", ascending=True)
    # take the topK closest words as the keywords of the document
    wordlist = np.array(result['word'])
    word_split = [wordlist[x] for x in range(0, topK)]
    word_split = " ".join(word_split)
    return word_split
if __name__ == '__main__':
    # read the data set
    dataFile = 'data/text.csv'
    articleData = pd.read_csv(dataFile)
    ids, titles, keys = [], [], []
    rootdir = "result/vecs"         # directory holding the per-document word-vector files
    fileList = os.listdir(rootdir)  # list everything in that directory
    # iterate over the word-vector files
    for i in range(len(fileList)):
        filename = fileList[i]
        path = os.path.join(rootdir, filename)
        if os.path.isfile(path):
            # read the word-vector data of one document
            data = pd.read_csv(path, encoding='utf-8')
            # cluster the vectors to get this document's keywords
            artile_keys = words_kmeans(data, 5)
            # recover the article id from the file name
            (shortname, extension) = os.path.splitext(filename)
            t = shortname.split("_")
            article_id = int(t[len(t) - 1])  # article id
            # look up the article title
            artile_tit = articleData[articleData.id == article_id]['title']
            artile_tit = list(artile_tit)[0]  # Series -> string
            ids.append(article_id)
            titles.append(artile_tit)
            keys.append(artile_keys)
    # write all results to a single file
    result = pd.DataFrame({"id": ids, "title": titles, "key": keys}, columns=['id', 'title', 'key'])
    result = result.sort_values(by="id", ascending=True)  # sort by article id
    result.to_csv("result/word2vec.csv", index=False, encoding='utf_8_sig')
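A note on the design: with n_clusters=1 the K-Means step simply yields the mean vector of all candidate words, so ranking by Euclidean distance to that center picks the words closest to the document's "average meaning". The element-wise double loop in words_kmeans could also be vectorized; a minimal equivalent sketch (the helper name is hypothetical, not part of the original code):

# Equivalent, vectorized distance computation (sketch only)
import numpy as np

def center_distances(vecs, kmeans):
    # Euclidean distance of every candidate word vector to the single cluster center
    vec_words = np.asarray(vecs, dtype=float)  # shape (n_words, dim)
    vec_center = kmeans.cluster_centers_[0]    # shape (dim,)
    return np.linalg.norm(vec_words - vec_center, axis=1)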