语义特征#
# 如果只需要提取一部分特征,可以选择性地导入以下工具包
import os
import json
# 数据处理及可视化
import numpy as np
import matplotlib
matplotlib.rc("font", family='SimHei') # Font used so Chinese text can be displayed; on macOS switch to a font the system supports
# 自然语言处理
from srilm import LM
import hanlp
import torch
import torch.nn.functional as F
from transformers import (
BertTokenizer,
GPT2LMHeadModel,
TextGenerationPipeline,
AutoTokenizer,
AutoModelForSequenceClassification,
AutoModelForSeq2SeqLM,
pipeline
)
1 数据预处理:加载语料库以及进行分词#
def filter_str(astr, tokenizer):
    """Segment a sentence and return it as one space-delimited string.

    Args:
        astr: str, the sentence to segment.
        tokenizer: callable that maps a sentence to an iterable of word
            strings (a HanLP tokenizer in this notebook).

    Returns:
        str, the items returned by ``tokenizer`` joined by single spaces.
    """
    return ' '.join(tokenizer(astr))
def prepare_corpus(tokenizer, corpus, save_json_name):
    """Tokenize the sentences stored in a JSON file and write a text corpus.

    Mind the direction of the two paths: ``save_json_name`` is *read* and
    ``corpus`` is *written*.

    Args:
        tokenizer: HanLP tokenizer, forwarded to ``filter_str``.
        corpus: str, output path; receives one space-separated sentence
            per line.
        save_json_name: str, input path of a UTF-8 JSON file whose top-level
            value is iterated sentence by sentence.

    Returns:
        None.
    """
    with open(save_json_name, 'r', encoding='utf-8') as fp:
        wiki_texts = json.load(fp)
    wiki_texts_new = [filter_str(line, tokenizer) for line in wiki_texts]
    # Use a context manager so the handle is flushed and closed, and pin the
    # encoding so the Chinese text is written the same way on every platform
    # (the input above is read as UTF-8 as well).
    with open(corpus, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(wiki_texts_new))
# Load the word segmentation model from HanLP
hanlp_tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# Wiki corpus (JSON)
wiki_file = './srilm_data_model/wiki_demo/wiki_z.json'
# Corpus file after word segmentation
wiki_file_tkd = './srilm_data_model/wiki_demo/wiki_z_word.txt'
# Run: prepare_corpus writes to its second argument and reads its third
prepare_corpus(hanlp_tok, wiki_file_tkd, wiki_file)
100%|██████████| 1/1 [00:00<00:00, 2.23it/s]
2 基于语料库统计的N-gram计算#
2.1 从语料库中生成N-gram模型#
将分词后的语料库路径(corpus)和SRILM的ngram-count可执行文件路径(ngram)传入函数,统计好的N-gram模型会输出到模型存储路径(model_path)
现成的N-gram语料库:google n-gram
def generate_model(model_path, ngram, corpus):
    """Train a trigram language model by invoking SRILM's ``ngram-count``.

    Runs ``<ngram> -text <corpus> -order 3 -kndiscount3 -lm <model_path>``.

    Args:
        model_path: str, path the n-gram model is written to.
        ngram: str, path of the ``ngram-count`` executable.
        corpus: str, path of the (tokenized) training corpus.

    Returns:
        None.

    Raises:
        OSError: if ``ngram`` cannot be executed (e.g. it does not exist).
        subprocess.CalledProcessError: if the command exits with a non-zero
            status.
    """
    # Imported locally so this function stays self-contained within the notebook.
    import subprocess

    # Pass the arguments as a list instead of a shell string: paths that
    # contain spaces or shell metacharacters reach ngram-count unchanged.
    cmd = [ngram, '-text', corpus, '-order', '3', '-kndiscount3', '-lm', model_path]
    # check=True surfaces a failed run instead of silently continuing without
    # a usable model file.
    subprocess.run(cmd, check=True)
# Machine-specific absolute path of the SRILM ngram-count binary; adjust to the local install
ngram = '/home/zhang/acoustic_theory/workspace/21-12-30-srilm/srilm/bin/i686-m64/ngram-count'
# Tokenized corpus used as training text
wiki_file_tkd = './srilm_data_model/wiki_demo/wiki_z_word.txt'
# Destination of the trained N-gram model
model_path = './srilm_data_model/wiki_demo/wiki_z_word.lm'
generate_model(model_path, ngram, wiki_file_tkd)
warning: discount coeff 1 is out of range: 0
warning: discount coeff 7 is out of range: 1.91919
# NOTE(review): this path is under wiki/, not the wiki_demo/ location the model was
# generated into above; presumably a separately built model, confirm it exists.
model_path = './srilm_data_model/wiki/wiki_z_word.lm'
lm = LM(model_path, lower=True) # Load the N-gram model
2.2 采用N-gram模型计算词频#
用srilm的LM来调用上面加载的N-gram模型,采用lm.logprob_strings(word, context)得到以10为底的对数概率 \(\log_{10} p \left( \mathrm{word} \mid \mathrm{context} \right)\)。word是当前词;当context是空列表[]时相当于1-gram,即词频
# Word frequency: query each word with an empty context (1-gram).
print(f"{'*' * 20} 计算词频 {'*' * 20}")
word_freq0_ = lm.logprob_strings('的', [])
word_freq1_ = lm.logprob_strings('西瓜', [])
word_freq2_ = lm.logprob_strings('桌子', [])
# Print the results
print(f"{'=' * 20}P(的) vs P(西瓜) vs P(桌子){'=' * 20}")
print(f'P(的): {word_freq0_}')
print(f'P(西瓜): {word_freq1_}')
print(f'P(桌子): {word_freq2_}')
******************** 计算词频 ********************
====================P(的) vs P(西瓜) vs P(桌子)====================
P(的): -1.3277089595794678
P(西瓜): -5.5793938636779785
P(桌子): -5.5162577629089355
2.3 采用N-gram模型计算转移概率#
当\(n>1\)时,在context中放入前\(n-1\)个词,顺序是从右到左,即离当前词最近的词排在最前。例如计算“喜欢 吃 西瓜”中“西瓜”的转移概率时,context为['吃', '喜欢']。
# Trigram transition log-probabilities of two candidate words after the same
# two-word context.
tp1_ = lm.logprob_strings('西瓜', ['吃', '喜欢'])
tp2_ = lm.logprob_strings('桌子', ['吃', '喜欢'])
print(f"{'=' * 10}P(西瓜 | 吃, 喜欢) vs P(桌子 | 吃, 喜欢){'=' * 10}")
print(f'P(西瓜 | 吃, 喜欢): {tp1_}')
print(f'P(桌子 | 吃, 喜欢): {tp2_}')
==========P(西瓜 | 吃, 喜欢) vs P(桌子 | 吃, 喜欢)==========
P(西瓜 | 吃, 喜欢): -2.884925365447998
P(桌子 | 吃, 喜欢): -6.211382865905762
2.4 采用N-gram模型计算surprisal#
\(\rm{surprisal} = -\log{ \it{p} \left( \rm{word} | context \right)}\),所以只要取负即可。
# Surprisal is the negated log-probability of the word given its context.
s1_ = -lm.logprob_strings('西瓜', ['吃', '喜欢'])
s2_ = -lm.logprob_strings('桌子', ['吃', '喜欢'])
print(f"{'=' * 10}surprisal(西瓜 | 吃, 喜欢) vs surprisal(桌子 | 吃, 喜欢){'=' * 10}")
print(f'surprisal(西瓜 | 吃, 喜欢): {s1_}')
print(f'surprisal(桌子 | 吃, 喜欢): {s2_}')
==========surprisal(西瓜 | 吃, 喜欢) vs surprisal(桌子 | 吃, 喜欢)==========
surprisal(西瓜 | 吃, 喜欢): 2.884925365447998
surprisal(桌子 | 吃, 喜欢): 6.211382865905762
2.5 采用N-gram模型计算entropy#
\(\mathrm{entropy} = \sum_{w} p \left( w \mid \mathrm{context} \right) \cdot \mathrm{surprisal} \left( w \mid \mathrm{context} \right)\),所以对于给定的context,对词表中所有的词计算surprisal,再按各自的概率加权求期望
# Rebinds `lm` to a different model file (presumably character/morpheme-level, judging by the file name; confirm)
model_path = './srilm_data_model/wiki/wiki_z_morpheme.lm'
lm = LM(model_path, lower=True) # Load the N-gram model
def entropy_cal(lm, context):
    """Entropy of the distribution ``lm`` assigns to the next item after ``context``.

    Evaluates ``sum(-p * log10(p))`` over every vocabulary index
    ``0 .. lm.vocab.max_interned()``, treating the values returned by
    ``lm.logprob`` as base-10 log-probabilities.

    Args:
        lm: language model exposing ``vocab.intern``, ``vocab.max_interned``
            and ``logprob(index, context_indices)`` (an srilm ``LM`` here).
        context: list of str; each entry is interned and the resulting
            indices are handed to ``lm.logprob`` in the given order.

    Returns:
        NumPy float, the entropy in base-10 units (0.0 when no vocabulary
        entry has a finite log-probability).
    """
    context_idx = [lm.vocab.intern(w) for w in context]
    vocab_num = lm.vocab.max_interned() + 1
    logprobs = np.array(
        [lm.logprob(i, context_idx) for i in range(vocab_num)], dtype=float
    )
    # A log-probability of -inf means p == 0; such entries contribute nothing
    # and would otherwise produce 0 * inf = nan, so drop them first.
    finite = logprobs[logprobs > -np.inf]
    # np.sum keeps the reduction vectorized; the builtin sum would iterate
    # over the (vocabulary-sized) array element by element.
    return np.sum(-np.power(10, finite) * finite)
# Compare the entropy after two different one-item contexts.
print(f"{'=' * 10}entropy(蝴) vs entropy(。){'=' * 10}")
e1_ = entropy_cal(lm, ['蝴'])
print(f'entropy(蝴): {e1_}')
e2_ = entropy_cal(lm, ['。'])
print(f'entropy(。): {e2_}')
==========entropy(蝴) vs entropy(。)==========
entropy(蝴): 0.03182660213747036
entropy(。): 2.5136258206385347
3 基于深度学习模型的转移概率计算#
以gpt-2为例,采用的模型为gpt2-chinese-cluecorpussmall
3.1 加载模型,包括分词模型与语言模型#
from transformers import BertTokenizer, GPT2LMHeadModel
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # Checkpoint identifier / path
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # Tokenizer
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # Language model
3.2 获取模型的转移概率#
model.config.output_hidden_states = True # Setting this to True in the model config lets the model output hidden states
inputs = tokenizer('蝴蝶飞舞。绵羊吃草。', return_tensors="pt") # Tokenize the sentence
tks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print('='*10 + '输入tokens: ' + '='*10)
print(tks)
outputs = model(**inputs) # Feed the tokenized sentence to the model and collect its outputs
# NOTE: despite the name, `probs` holds the `logits` of the first (only) sequence;
# they become probabilities only once softmax is applied in the next step.
probs = outputs.logits[0]
print('='*10 + '转移概率维度: ' + '='*10)
print(str(probs.shape) + ' 输入字数 x 总字数')
==========输入tokens: ==========
['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草', '。', '[SEP]']
==========转移概率维度: ==========
torch.Size([12, 21128]) 输入字数 x 总字数
3.3 获取模型的surprisal与entropy#
# Softmax over the last axis turns each position's logits into a distribution
# over the vocabulary.
probs_sfm = F.softmax(probs, dim=-1)
input_ids_ = inputs['input_ids'][0]
# Row idx is the prediction made after reading tokens 0..idx, so it is scored
# against the token that actually follows, i.e. input_ids_[idx + 1].
prob_target = [probs_sfm[idx, in_id_tmp].item() for idx, in_id_tmp in enumerate(input_ids_[1:])]
gpt_surprisal = -np.log10(prob_target)
# nansum skips the nan produced by 0 * log10(0) if a probability underflows to 0.
gpt_entropy = -(torch.log10(probs_sfm) * probs_sfm).nansum(dim=-1)
tks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for idx in range(len(tks)-1):
    print('\n==========================================')
    print(f'previous tokens: {tks[:idx+1]}')
    prob_sort_idx = probs_sfm[idx, :].argsort(descending=True)
    # Only the ten most probable ids are shown, so convert just those (as a
    # plain list of ints) instead of the whole vocabulary at every position.
    pred_tks = tokenizer.convert_ids_to_tokens(prob_sort_idx[:10].tolist())
    print(f'tokens (top k): {pred_tks}')
    # print(f'prob of tokens (top k): {probs_sfm[idx, prob_sort_idx[:10]]}')
    print(f'surprisal of {tks[idx+1]}: {gpt_surprisal[idx]}; entropy: {gpt_entropy[idx]}; ')
==========================================
previous tokens: ['[CLS]']
tokens (top k): ['如', '很', '这', '有', '不', '书', '你', '为', '我', '一']
surprisal of 蝴: 4.849288742842096; entropy: 2.7137670516967773;
==========================================
previous tokens: ['[CLS]', '蝴']
tokens (top k): ['蝶', '蜓', '[UNK]', '蛹', '-', '蝴', '蜢', '##ser', '##e', '~']
surprisal of 蝶: 0.00013581902527431; entropy: 0.002098201308399439;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶']
tokens (top k): ['蝶', '(', '变', '的', '是', '酥', '属', '超', '结', '飞']
surprisal of 飞: 1.875799630285334; entropy: 2.705556869506836;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞']
tokens (top k): ['机', '行', '蝶', '蛾', '舞', '翔', '碟', '鸟', '鱼', '龙']
surprisal of 舞: 1.3007874730402909; entropy: 1.6138794422149658;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞']
tokens (top k): ['蝴', '蝶', '的', '是', '(', '(', ',', '-', '[SEP]', '《']
surprisal of 。: 2.868855494287529; entropy: 2.5130739212036133;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。']
tokens (top k): ['蝴', '[SEP]', '蝶', '这', '一', '是', '。', '我', '不', '飞']
surprisal of 绵: 4.865470369658404; entropy: 2.337616443634033;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵']
tokens (top k): ['绵', '羊', '长', '密', '綿', '阳', '[UNK]', '延', '软', '柔']
surprisal of 羊: 1.1589601338270719; entropy: 0.8466951251029968;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊']
tokens (top k): ['羔', '毛', '脂', '蝎', '角', ',', '驼', '绒', '羊', '年']
surprisal of 吃: 4.274023174230079; entropy: 2.755201816558838;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃']
tokens (top k): ['了', '羊', '的', '。', '草', '食', '起', '完', ',', '饭']
surprisal of 草: 1.705819622305031; entropy: 2.481321334838867;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草']
tokens (top k): ['。', '莓', ',', '原', '的', '蜢', '(', '地', '草', '[SEP]']
surprisal of 。: 0.7054180647842639; entropy: 1.960336446762085;
==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草', '。']
tokens (top k): ['[SEP]', '绵', '这', '我', '羊', '。', '蝴', '一', '小', '不']
surprisal of [SEP]: 0.6934596181641993; entropy: 2.7353546619415283;
4 词性#
## 0. 分词
sent_ex = '这个门被锁了,锁很难被打开。'
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tks = tok(sent_ex)
print('0. 分词结果:')
print(tks)
## 1. 词性标注
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
print('1. 词性标注:')
print(pos(tks))
Building model ...
0. 分词结果:
['这个', '门', '被', '锁', '了', ',', '锁', '很难', '被', '打开', '。']
1. 词性标注:
['DT', 'NN', 'SB', 'VV', 'SP', 'PU', 'VV', 'AD', 'SB', 'VV', 'PU']
5 词向量#
5.1 获取静态词向量:以word2vec为例#
word2vec = hanlp.load(hanlp.pretrained.word2vec.MERGE_SGNS_BIGRAM_CHAR_300_ZH) # Load the word2vec word vectors
# Look up the vector of a single word; as the last expression of the cell its value is displayed
word2vec('中国')
tensor([ 1.4234e-02, 8.3600e-02, 2.4145e-02, -1.0256e-01, -1.0829e-01,
-2.6786e-02, -9.6481e-02, 9.0537e-02, -5.4941e-02, 4.5936e-02,
-4.2577e-02, -5.1776e-02, 4.9661e-02, -3.2703e-02, -6.6407e-03,
9.8313e-03, 4.2377e-02, -7.1969e-02, 6.7363e-02, -1.2679e-01,
1.3423e-03, 1.8129e-02, 1.3923e-02, 6.0298e-02, 2.9974e-02,
3.4969e-02, 4.7053e-02, -1.4874e-02, 6.6235e-02, -1.5579e-01,
-1.1716e-01, 8.8726e-02, 6.0976e-02, -8.0692e-02, -3.1017e-02,
-1.3132e-02, 5.4841e-02, 4.0733e-02, -1.5295e-01, -7.8516e-02,
6.6119e-02, 2.9393e-02, -3.0162e-02, -4.3704e-02, 8.3047e-03,
-7.7654e-02, -1.5644e-02, 6.2678e-02, 7.3149e-02, -1.9128e-02,
2.7543e-02, -1.4893e-02, -1.2223e-02, 9.6474e-02, 2.1985e-02,
4.4640e-02, -2.4626e-02, 9.8536e-02, -1.3777e-01, 5.1621e-02,
9.5042e-02, -3.2784e-02, 2.8697e-02, -1.3267e-02, 1.1536e-02,
-9.0047e-02, -7.2654e-02, -8.7082e-04, -3.6991e-02, 1.6448e-03,
2.6809e-02, -7.5198e-02, -2.6094e-02, 6.5516e-03, -7.2922e-02,
-6.3720e-02, -6.4798e-03, 1.3006e-02, 1.7040e-02, -4.3527e-02,
1.6448e-03, -4.0217e-02, 2.1293e-02, -4.1442e-02, -4.9964e-02,
1.0784e-02, 1.2986e-01, -1.7174e-02, 9.0332e-02, 8.1890e-04,
-4.3150e-02, -6.7029e-02, -4.6127e-02, -6.4486e-02, -1.8022e-02,
1.3425e-02, 6.9962e-02, -1.4400e-02, 6.0225e-03, -3.7480e-03,
8.5195e-03, -2.2870e-02, -4.1049e-02, -1.8603e-02, -5.3075e-02,
-7.1510e-02, 9.2589e-03, -6.3029e-03, -2.4524e-02, -3.4340e-02,
-8.8730e-02, 1.5332e-02, 2.8820e-02, 1.8295e-02, -5.8320e-02,
-2.7167e-02, -1.7402e-02, -7.7428e-02, -1.0769e-01, -1.0446e-01,
4.5363e-02, -6.3230e-02, 8.3784e-02, 5.3965e-02, 2.0121e-02,
-3.7716e-02, -2.0752e-02, -6.2321e-02, -1.3778e-01, 5.0385e-02,
8.9087e-06, -8.1429e-02, 6.1611e-02, -4.1132e-02, 7.4521e-02,
-5.0390e-02, -1.6549e-02, 4.1053e-02, -1.7056e-02, -1.2268e-02,
-1.3683e-02, 1.0725e-02, -5.9534e-02, -3.3246e-02, 3.8279e-02,
-3.6564e-02, 6.8516e-02, 6.6845e-02, 4.3522e-02, -2.3375e-02,
-1.3111e-02, 1.4433e-03, 3.9912e-02, 3.8543e-03, 8.9713e-02,
1.9988e-02, 9.5058e-04, -7.2403e-02, -3.7107e-02, -6.4932e-02,
-2.1959e-02, 3.4034e-02, -2.9596e-02, -6.8593e-02, -1.9584e-02,
4.0717e-02, -1.0285e-01, -6.5889e-03, 9.2453e-03, -4.2289e-02,
-5.7992e-02, 3.3845e-02, 1.3048e-02, -5.1361e-02, 7.8392e-02,
-1.9344e-02, -1.0448e-01, 4.1529e-02, -9.7657e-02, -3.4509e-03,
4.9083e-02, 5.5863e-02, 8.7877e-03, -1.1969e-01, 7.1582e-02,
2.4624e-02, -2.8234e-03, -1.0275e-01, -8.0798e-02, -1.2945e-01,
1.7228e-02, -8.7083e-02, -4.5541e-02, -3.6977e-02, 7.5634e-02,
6.3264e-02, -1.0102e-01, -9.6761e-02, -1.7960e-02, -1.6474e-02,
6.5089e-02, -5.6679e-02, 1.7903e-02, -6.3342e-02, 2.1894e-02,
-8.5694e-03, -2.0418e-02, 9.6943e-02, 6.6336e-02, 5.3024e-02,
7.7205e-02, 7.5687e-02, -2.4854e-02, -8.4196e-02, 7.2153e-02,
-3.3994e-02, 2.7743e-02, 7.6132e-02, 1.2271e-01, 8.2420e-02,
2.2781e-02, 6.0472e-03, -1.5400e-01, -1.1090e-01, -1.8680e-03,
9.7762e-02, 3.7373e-03, -2.6415e-02, 1.7530e-02, 9.8943e-03,
-4.3207e-02, 4.6805e-02, 1.3863e-02, -5.2318e-02, -3.4550e-03,
-3.7918e-02, 2.9433e-02, 3.3142e-02, 8.7807e-03, 3.0049e-02,
8.8094e-02, 1.4916e-03, -1.7431e-02, -2.5317e-02, -1.6277e-02,
1.1268e-02, 9.4293e-02, 3.3744e-02, -3.4135e-02, 6.1734e-04,
-5.8349e-02, 1.2800e-01, 2.4264e-03, -1.0573e-01, -2.0444e-02,
3.9112e-02, -1.4461e-01, 6.4038e-02, -8.3256e-03, -4.6320e-02,
-1.3400e-02, 1.2040e-02, 7.3522e-02, -1.6663e-02, -1.2628e-03,
-2.7094e-02, -1.8414e-03, 6.0205e-02, -6.7361e-02, 5.6380e-02,
2.3484e-03, -4.5203e-03, 4.1993e-02, 2.9977e-02, -1.2228e-02,
2.8904e-03, -1.7870e-02, -1.3307e-02, -4.5424e-02, -3.1245e-02,
4.0651e-03, 1.0091e-01, 6.3333e-02, 1.5903e-01, 9.9152e-02,
-2.0661e-02, 6.4784e-03, 1.3163e-03, 2.6181e-02, -9.9187e-03,
1.4386e-02, -4.5888e-02, 5.6548e-02, 3.5045e-02, 5.5262e-02,
3.0622e-02, 9.1758e-03, -1.0747e-01, 5.5859e-03, -5.0639e-02],
device='cuda:0')
捕获了性别信息
捕获了首都信息
# Reference direction: the offset between the vectors of '男' and '女'.
gender_axis = word2vec('男') - word2vec('女')
# Cosine similarity of two word-pair offsets (each relative to '王妃') with
# that reference direction.
for royal in ('国王', '公主'):
    royal_offset = word2vec(royal) - word2vec('王妃')
    print(torch.nn.functional.cosine_similarity(royal_offset, gender_axis, dim=0))
tensor(0.1429, device='cuda:0')
tensor(0.0366, device='cuda:0')
# Reference direction: the offset between the vectors of '中国' and '北京'.
capital_axis = word2vec('中国') - word2vec('北京')
# Cosine similarity of two country offsets (each relative to '东京') with
# that reference direction.
for country in ('日本', '韩国'):
    country_offset = word2vec(country) - word2vec('东京')
    print(torch.nn.functional.cosine_similarity(country_offset, capital_axis, dim=0))
tensor(0.4674, device='cuda:0')
tensor(0.3933, device='cuda:0')
计算相似词
# Query with a single word
print(word2vec.most_similar('北京'))
print('\n')
{'上海': 0.6443496942520142, '天津': 0.6384099721908569, '西安': 0.611718475818634, '南京': 0.6113559603691101, '北京市': 0.6093109846115112, '海淀': 0.6049214601516724, '广州': 0.5977935791015625, '京城': 0.5955069661140442, '沈阳': 0.5865166187286377, '深圳': 0.580772876739502}
5.2 获取基于上下文的词向量:语言模型的隐藏层表征#
同样以3.1中加载的gpt2-chinese-cluecorpussmall为例
from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # Checkpoint identifier / path
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # Tokenizer
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # Language model
# Ask the model to return hidden states so they can be read from `outputs.hidden_states` below
model.config.output_hidden_states = True
inputs = tokenizer('小明喜欢吃西瓜。小明喜欢打篮球。小明经常去花店', return_tensors="pt")
outputs = model(**inputs)
print('\n' + '='*10 + '最后一层输出的内隐表征维度: ' + '='*10)
# hidden_states[-1] is the last entry of the returned hidden states
print(str(outputs.hidden_states[-1].shape) + ' 1 x 输入字数 x 表征维度')
==========最后一层输出的内隐表征维度: ==========
torch.Size([1, 25, 768]) 1 x 输入字数 x 表征维度