语义特征#

# 如果只需要提取一部分特征,可以选择性地导入以下工具包
import os
import json

# 数据处理及可视化
import numpy as np
import matplotlib
matplotlib.rc("font", family='SimHei') # 用来显示中文,对于macos系统需要换一个支持的字体

# 自然语言处理
from srilm import LM
import hanlp
import torch
import torch.nn.functional as F
from transformers import (
    BertTokenizer,
    GPT2LMHeadModel, 
    TextGenerationPipeline,
    AutoTokenizer, 
    AutoModelForSequenceClassification, 
    AutoModelForSeq2SeqLM,
    pipeline
    )

1 数据预处理:加载语料库以及进行分词#

def filter_str(astr, tokenizer):
    """Tokenize a sentence and join the tokens with single spaces.

    Args:
        astr: the sentence to tokenize.
        tokenizer: callable that takes the sentence and returns an iterable
            of token strings (the surrounding code passes a HanLP tokenizer).

    Returns:
        The tokens joined into one space-separated string.
    """
    return ' '.join(tokenizer(astr))

def prepare_corpus(tokenizer, corpus, save_json_name):
    """Tokenize every entry of a JSON corpus and write the result as text.

    Mind the direction of the two path arguments: ``save_json_name`` is the
    file that is READ and ``corpus`` is the file that is WRITTEN.

    Args:
        tokenizer: tokenizer callable, forwarded to ``filter_str``.
        corpus: path of the text file to write; it receives one tokenized
            entry per line, joined with newlines.
        save_json_name: path of the UTF-8 JSON file to read; iterating the
            loaded object must yield the sentences to tokenize.

    Returns:
        None. The only effect is the file written to ``corpus``.
    """
    with open(save_json_name, 'r', encoding='utf-8') as fp:
        wiki_texts = json.load(fp)
    wiki_texts_new = [filter_str(line, tokenizer) for line in wiki_texts]
    # Write through a context manager so the handle is closed and flushed
    # deterministically, and pin the encoding so the Chinese text does not
    # depend on the platform's default encoding.
    with open(corpus, 'w', encoding='utf-8') as fp:
        fp.write('\n'.join(wiki_texts_new))

# Load the HanLP tokenization model.
hanlp_tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
# Wiki corpus in JSON form; prepare_corpus reads this file.
wiki_file = './srilm_data_model/wiki_demo/wiki_z.json'
# Tokenized corpus as plain text; prepare_corpus writes this file.
wiki_file_tkd = './srilm_data_model/wiki_demo/wiki_z_word.txt'
# Run the tokenization. Keyword arguments make it explicit which path is the
# output (corpus) and which is the input (save_json_name).
prepare_corpus(tokenizer=hanlp_tok, corpus=wiki_file_tkd, save_json_name=wiki_file)
100%|██████████| 1/1 [00:00<00:00,  2.23it/s]

2 基于语料库统计的N-gram计算#

2.1 从语料库中生成N-gram模型#

  • 将分词后的语料库(corpus)交给SRILM的ngram-count程序(ngram为该可执行文件的路径),统计得到的3-gram模型保存到模型存储路径(model_path)

  • 现成的N-gram语料库:google n-gram

def generate_model(model_path, ngram, corpus):
    """Build an N-gram language model by running the ``ngram-count`` tool.

    The tool is invoked with ``-order 3 -kndiscount3``; the corpus is passed
    via ``-text`` and the output location via ``-lm``.

    Args:
        model_path: path where the tool writes the language model.
        ngram: path of the ``ngram-count`` executable.
        corpus: path of the corpus file handed to the tool.

    Returns:
        None.

    Raises:
        FileNotFoundError: if the ``ngram`` executable cannot be found.
        subprocess.CalledProcessError: if the tool exits with a non-zero
            status.
    """
    import subprocess

    # Pass the arguments as a list instead of a formatted shell string so
    # that paths containing spaces or shell metacharacters are handled
    # verbatim, and fail loudly instead of ignoring the exit status.
    cmd = [ngram, '-text', corpus, '-order', '3', '-kndiscount3', '-lm', model_path]
    subprocess.run(cmd, check=True)

# Absolute path of the ngram-count executable; machine-specific, adjust it to
# the local installation.
ngram = '/home/zhang/acoustic_theory/workspace/21-12-30-srilm/srilm/bin/i686-m64/ngram-count'
# Tokenized corpus to count N-grams from.
wiki_file_tkd = './srilm_data_model/wiki_demo/wiki_z_word.txt'
# Where the resulting language model is written.
model_path = './srilm_data_model/wiki_demo/wiki_z_word.lm'
generate_model(model_path=model_path, ngram=ngram, corpus=wiki_file_tkd)
warning: discount coeff 1 is out of range: 0
warning: discount coeff 7 is out of range: 1.91919
# NOTE(review): this path is under ./srilm_data_model/wiki/, not the
# wiki_demo/ directory the model was just generated into -- presumably a
# model built from the full corpus; confirm.
model_path = './srilm_data_model/wiki/wiki_z_word.lm'
lm = LM(model_path, lower=True) # load the N-gram model

2.2 采用N-gram模型计算词频#

用srilm的LM来调用刚刚生成的模型,采用lm.logprob_strings(word, context)来计算 \(\log_{10} p \left( \mathrm{word} \mid \mathrm{context} \right)\),其中word是当前单词,context是上文单词的列表;当context是空列表[]时相当于1-gram,即词频(返回值为对数概率)

# Word frequency: an empty context list gives the 1-gram log-probability.
print('{0} 计算词频 {0}'.format('*' * 20))
word_freq0_ = lm.logprob_strings('的', [])
word_freq1_ = lm.logprob_strings('西瓜', [])
word_freq2_ = lm.logprob_strings('桌子', [])

# Report the three values.
print('{0}P(的) vs P(西瓜) vs P(桌子){0}'.format('=' * 20))
print('P(的):', word_freq0_)
print('P(西瓜):', word_freq1_)
print('P(桌子):', word_freq2_)
******************** 计算词频 ********************
====================P(的) vs P(西瓜) vs P(桌子)====================
P(的): -1.3277089595794678
P(西瓜): -5.5793938636779785
P(桌子): -5.5162577629089355

2.3 采用N-gram模型计算转移概率#

\(n>1\)时,在context中放入前\(n-1\)个词,顺序是从右到左,即离当前词最近的词排在最前面。例如对于“喜欢 吃 西瓜”,计算“西瓜”的转移概率时,context为['吃', '喜欢']。

# Log-probability of each candidate word given the same two context words;
# the context list is ordered from the nearest preceding word outwards.
tp1_ = lm.logprob_strings('西瓜', ['吃', '喜欢'])
tp2_ = lm.logprob_strings('桌子', ['吃', '喜欢'])
print('{0}P(西瓜 | 吃, 喜欢) vs P(桌子 | 吃, 喜欢){0}'.format('=' * 10))
print('P(西瓜 | 吃, 喜欢):', tp1_)
print('P(桌子 | 吃, 喜欢):', tp2_)
==========P(西瓜 | 吃, 喜欢) vs P(桌子 | 吃, 喜欢)==========
P(西瓜 | 吃, 喜欢): -2.884925365447998
P(桌子 | 吃, 喜欢): -6.211382865905762

2.4 采用N-gram模型计算surprisal#

\(\mathrm{surprisal} = -\log_{10} p \left( \mathrm{word} \mid \mathrm{context} \right)\),所以只要对上面得到的对数概率取负即可。

# Surprisal is the negated log-probability of the word given its context.
s1_ = -lm.logprob_strings('西瓜', ['吃', '喜欢'])
s2_ = -lm.logprob_strings('桌子', ['吃', '喜欢'])
print('{0}surprisal(西瓜 | 吃, 喜欢) vs surprisal(桌子 | 吃, 喜欢){0}'.format('=' * 10))
print('surprisal(西瓜 | 吃, 喜欢):', s1_)
print('surprisal(桌子 | 吃, 喜欢):', s2_)
==========surprisal(西瓜 | 吃, 喜欢) vs surprisal(桌子 | 吃, 喜欢)==========
surprisal(西瓜 | 吃, 喜欢): 2.884925365447998
surprisal(桌子 | 吃, 喜欢): 6.211382865905762

2.5 采用N-gram模型计算entropy#

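\(\mathrm{entropy} = \sum_{w} p \left( w \mid \mathrm{context} \right) \cdot \mathrm{surprisal} \left( w \mid \mathrm{context} \right) = -\sum_{w} p \left( w \mid \mathrm{context} \right) \log_{10} p \left( w \mid \mathrm{context} \right)\),所以对于给定的context,对词表中所有的词\(w\)计算surprisal,然后按概率求期望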

# Replace the previously loaded model with the one stored in
# wiki_z_morpheme.lm.
# NOTE(review): presumably a character/morpheme-level model, since the
# entropy examples below use single characters as context -- confirm.
model_path = './srilm_data_model/wiki/wiki_z_morpheme.lm'
lm = LM(model_path, lower=True) # load the N-gram model
def entropy_cal(lm, context):
    """Return the entropy of the next-word distribution given ``context``.

    Every vocabulary index from 0 through ``lm.vocab.max_interned()`` is
    scored with ``lm.logprob``. Indices whose log-probability is ``-inf`` are
    dropped; the rest are treated as base-10 log-probabilities.

    Args:
        lm: language model exposing ``vocab.intern``, ``vocab.max_interned``
            and ``logprob``.
        context: list of context words; each is mapped to a vocabulary index
            with ``lm.vocab.intern`` before scoring.

    Returns:
        The sum of ``-p * log10(p)`` over the retained vocabulary entries.
    """
    context_ids = [lm.vocab.intern(word) for word in context]
    vocab_size = lm.vocab.max_interned() + 1
    scores = np.array([lm.logprob(word_id, context_ids) for word_id in range(vocab_size)])
    # Keep only entries with a finite lower bound; -inf marks zero probability.
    kept = scores[scores > -np.inf]
    probabilities = np.power(10, kept)
    return sum(-probabilities * kept)

# Compare the entropy of the next-token distribution after two contexts.
print('{0}entropy(蝴) vs entropy(。){0}'.format('=' * 10))
e1_ = entropy_cal(lm, ['蝴'])
print('entropy(蝴):', e1_)
e2_ = entropy_cal(lm, ['。'])
print('entropy(。):', e2_)
==========entropy(蝴) vs entropy(。)==========
entropy(蝴): 0.03182660213747036
entropy(。): 2.5136258206385347

3 基于深度学习模型的转移概率计算#

以gpt-2为例,采用的模型为gpt2-chinese-cluecorpussmall

3.1 加载模型,包括分词模型与语言模型#

# Load the tokenizer and the language model from the same checkpoint.
from transformers import BertTokenizer, GPT2LMHeadModel
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # checkpoint identifier / path
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # tokenizer
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # language model

3.2 获取模型的转移概率#

model.config.output_hidden_states = True  # setting this on the config makes the model also return hidden states
inputs = tokenizer('蝴蝶飞舞。绵羊吃草。', return_tensors="pt") # tokenize the sentence

tks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
print('='*10 + '输入tokens: ' + '='*10)
print(tks)

# This is inference only: disable autograd so no computation graph is built
# and kept alive by the outputs.
with torch.no_grad():
    outputs = model(**inputs)  # run the tokenized sentence through the model
# NOTE: despite the name these are the raw logits of the first (only) batch
# item; they are normalized with softmax where they are consumed.
probs = outputs.logits[0]
print('='*10 + '转移概率维度: ' + '='*10)
print(str(probs.shape) + '  输入字数 x 总字数')
==========输入tokens: ==========
['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草', '。', '[SEP]']
==========转移概率维度: ==========
torch.Size([12, 21128])  输入字数 x 总字数

3.3 获取模型的surprisal与entropy#

# Normalize each row of logits into a distribution over the vocabulary.
probs_sfm = F.softmax(probs, dim=-1)
input_ids_ = inputs['input_ids'][0]
# Row idx is paired with the token at position idx + 1, i.e. the token that
# actually follows the prefix ending at position idx.
prob_target = [probs_sfm[idx, in_id_tmp].item() for idx, in_id_tmp in enumerate(input_ids_[1:])]

# Surprisal and entropy both use base-10 logarithms.
gpt_surprisal = -np.log10(prob_target)
# nansum skips the NaN produced by 0 * log10(0) for zero-probability entries.
gpt_entropy = -(torch.log10(probs_sfm) * probs_sfm).nansum(dim=-1)
tks = tokenizer.convert_ids_to_tokens(inputs['input_ids'][0])
for idx, next_tk in enumerate(tks[1:]):
    print('\n==========================================')
    print(f'previous tokens: {tks[:idx+1]}')

    # Only the ten most probable candidates are shown, so select them with
    # topk instead of sorting and converting the whole vocabulary each step.
    top_ids = probs_sfm[idx].topk(10).indices
    pred_tks = tokenizer.convert_ids_to_tokens(top_ids.tolist())
    print(f'tokens (top k): {pred_tks}')

    print(f'surprisal of {next_tk}: {gpt_surprisal[idx]}; entropy: {gpt_entropy[idx]}; ')
==========================================
previous tokens: ['[CLS]']
tokens (top k): ['如', '很', '这', '有', '不', '书', '你', '为', '我', '一']
surprisal of 蝴: 4.849288742842096; entropy: 2.7137670516967773; 

==========================================
previous tokens: ['[CLS]', '蝴']
tokens (top k): ['蝶', '蜓', '[UNK]', '蛹', '-', '蝴', '蜢', '##ser', '##e', '~']
surprisal of 蝶: 0.00013581902527431; entropy: 0.002098201308399439; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶']
tokens (top k): ['蝶', '(', '变', '的', '是', '酥', '属', '超', '结', '飞']
surprisal of 飞: 1.875799630285334; entropy: 2.705556869506836; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞']
tokens (top k): ['机', '行', '蝶', '蛾', '舞', '翔', '碟', '鸟', '鱼', '龙']
surprisal of 舞: 1.3007874730402909; entropy: 1.6138794422149658; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞']
tokens (top k): ['蝴', '蝶', '的', '是', '(', '(', ',', '-', '[SEP]', '《']
surprisal of 。: 2.868855494287529; entropy: 2.5130739212036133; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。']
tokens (top k): ['蝴', '[SEP]', '蝶', '这', '一', '是', '。', '我', '不', '飞']
surprisal of 绵: 4.865470369658404; entropy: 2.337616443634033; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵']
tokens (top k): ['绵', '羊', '长', '密', '綿', '阳', '[UNK]', '延', '软', '柔']
surprisal of 羊: 1.1589601338270719; entropy: 0.8466951251029968; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊']
tokens (top k): ['羔', '毛', '脂', '蝎', '角', ',', '驼', '绒', '羊', '年']
surprisal of 吃: 4.274023174230079; entropy: 2.755201816558838; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃']
tokens (top k): ['了', '羊', '的', '。', '草', '食', '起', '完', ',', '饭']
surprisal of 草: 1.705819622305031; entropy: 2.481321334838867; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草']
tokens (top k): ['。', '莓', ',', '原', '的', '蜢', '(', '地', '草', '[SEP]']
surprisal of 。: 0.7054180647842639; entropy: 1.960336446762085; 

==========================================
previous tokens: ['[CLS]', '蝴', '蝶', '飞', '舞', '。', '绵', '羊', '吃', '草', '。']
tokens (top k): ['[SEP]', '绵', '这', '我', '羊', '。', '蝴', '一', '小', '不']
surprisal of [SEP]: 0.6934596181641993; entropy: 2.7353546619415283; 

4 词性#

## 0. Tokenization
sent_ex = '这个门被锁了,锁很难被打开。'
# Same pretrained tokenizer identifier as the one loaded for corpus
# preparation; loaded again here so this cell can run on its own.
tok = hanlp.load(hanlp.pretrained.tok.COARSE_ELECTRA_SMALL_ZH)
tks = tok(sent_ex)
print('0. 分词结果:')
print(tks)

## 1. Part-of-speech tagging
# The tagger is applied to the token list produced above, not the raw sentence.
pos = hanlp.load(hanlp.pretrained.pos.CTB9_POS_ELECTRA_SMALL)
print('1. 词性标注:')
print(pos(tks))
Building model ...          
0. 分词结果:
['这个', '门', '被', '锁', '了', ',', '锁', '很难', '被', '打开', '。']
                                             
1. 词性标注:
['DT', 'NN', 'SB', 'VV', 'SP', 'PU', 'VV', 'AD', 'SB', 'VV', 'PU']

5 词向量#

5.1 获取静态词向量:以word2vec为例#

  • hanlp支持调用各种静态词向量, 包括word2vec, glove等等,具体的模型及文献可以在链接文档中进行选择,一般情况下维度越高越准确。

word2vec = hanlp.load(hanlp.pretrained.word2vec.MERGE_SGNS_BIGRAM_CHAR_300_ZH) # load the word2vec embeddings
# Bare expression: in a notebook this displays the vector looked up for the word.
word2vec('中国')
                                                        
tensor([ 1.4234e-02,  8.3600e-02,  2.4145e-02, -1.0256e-01, -1.0829e-01,
        -2.6786e-02, -9.6481e-02,  9.0537e-02, -5.4941e-02,  4.5936e-02,
        -4.2577e-02, -5.1776e-02,  4.9661e-02, -3.2703e-02, -6.6407e-03,
         9.8313e-03,  4.2377e-02, -7.1969e-02,  6.7363e-02, -1.2679e-01,
         1.3423e-03,  1.8129e-02,  1.3923e-02,  6.0298e-02,  2.9974e-02,
         3.4969e-02,  4.7053e-02, -1.4874e-02,  6.6235e-02, -1.5579e-01,
        -1.1716e-01,  8.8726e-02,  6.0976e-02, -8.0692e-02, -3.1017e-02,
        -1.3132e-02,  5.4841e-02,  4.0733e-02, -1.5295e-01, -7.8516e-02,
         6.6119e-02,  2.9393e-02, -3.0162e-02, -4.3704e-02,  8.3047e-03,
        -7.7654e-02, -1.5644e-02,  6.2678e-02,  7.3149e-02, -1.9128e-02,
         2.7543e-02, -1.4893e-02, -1.2223e-02,  9.6474e-02,  2.1985e-02,
         4.4640e-02, -2.4626e-02,  9.8536e-02, -1.3777e-01,  5.1621e-02,
         9.5042e-02, -3.2784e-02,  2.8697e-02, -1.3267e-02,  1.1536e-02,
        -9.0047e-02, -7.2654e-02, -8.7082e-04, -3.6991e-02,  1.6448e-03,
         2.6809e-02, -7.5198e-02, -2.6094e-02,  6.5516e-03, -7.2922e-02,
        -6.3720e-02, -6.4798e-03,  1.3006e-02,  1.7040e-02, -4.3527e-02,
         1.6448e-03, -4.0217e-02,  2.1293e-02, -4.1442e-02, -4.9964e-02,
         1.0784e-02,  1.2986e-01, -1.7174e-02,  9.0332e-02,  8.1890e-04,
        -4.3150e-02, -6.7029e-02, -4.6127e-02, -6.4486e-02, -1.8022e-02,
         1.3425e-02,  6.9962e-02, -1.4400e-02,  6.0225e-03, -3.7480e-03,
         8.5195e-03, -2.2870e-02, -4.1049e-02, -1.8603e-02, -5.3075e-02,
        -7.1510e-02,  9.2589e-03, -6.3029e-03, -2.4524e-02, -3.4340e-02,
        -8.8730e-02,  1.5332e-02,  2.8820e-02,  1.8295e-02, -5.8320e-02,
        -2.7167e-02, -1.7402e-02, -7.7428e-02, -1.0769e-01, -1.0446e-01,
         4.5363e-02, -6.3230e-02,  8.3784e-02,  5.3965e-02,  2.0121e-02,
        -3.7716e-02, -2.0752e-02, -6.2321e-02, -1.3778e-01,  5.0385e-02,
         8.9087e-06, -8.1429e-02,  6.1611e-02, -4.1132e-02,  7.4521e-02,
        -5.0390e-02, -1.6549e-02,  4.1053e-02, -1.7056e-02, -1.2268e-02,
        -1.3683e-02,  1.0725e-02, -5.9534e-02, -3.3246e-02,  3.8279e-02,
        -3.6564e-02,  6.8516e-02,  6.6845e-02,  4.3522e-02, -2.3375e-02,
        -1.3111e-02,  1.4433e-03,  3.9912e-02,  3.8543e-03,  8.9713e-02,
         1.9988e-02,  9.5058e-04, -7.2403e-02, -3.7107e-02, -6.4932e-02,
        -2.1959e-02,  3.4034e-02, -2.9596e-02, -6.8593e-02, -1.9584e-02,
         4.0717e-02, -1.0285e-01, -6.5889e-03,  9.2453e-03, -4.2289e-02,
        -5.7992e-02,  3.3845e-02,  1.3048e-02, -5.1361e-02,  7.8392e-02,
        -1.9344e-02, -1.0448e-01,  4.1529e-02, -9.7657e-02, -3.4509e-03,
         4.9083e-02,  5.5863e-02,  8.7877e-03, -1.1969e-01,  7.1582e-02,
         2.4624e-02, -2.8234e-03, -1.0275e-01, -8.0798e-02, -1.2945e-01,
         1.7228e-02, -8.7083e-02, -4.5541e-02, -3.6977e-02,  7.5634e-02,
         6.3264e-02, -1.0102e-01, -9.6761e-02, -1.7960e-02, -1.6474e-02,
         6.5089e-02, -5.6679e-02,  1.7903e-02, -6.3342e-02,  2.1894e-02,
        -8.5694e-03, -2.0418e-02,  9.6943e-02,  6.6336e-02,  5.3024e-02,
         7.7205e-02,  7.5687e-02, -2.4854e-02, -8.4196e-02,  7.2153e-02,
        -3.3994e-02,  2.7743e-02,  7.6132e-02,  1.2271e-01,  8.2420e-02,
         2.2781e-02,  6.0472e-03, -1.5400e-01, -1.1090e-01, -1.8680e-03,
         9.7762e-02,  3.7373e-03, -2.6415e-02,  1.7530e-02,  9.8943e-03,
        -4.3207e-02,  4.6805e-02,  1.3863e-02, -5.2318e-02, -3.4550e-03,
        -3.7918e-02,  2.9433e-02,  3.3142e-02,  8.7807e-03,  3.0049e-02,
         8.8094e-02,  1.4916e-03, -1.7431e-02, -2.5317e-02, -1.6277e-02,
         1.1268e-02,  9.4293e-02,  3.3744e-02, -3.4135e-02,  6.1734e-04,
        -5.8349e-02,  1.2800e-01,  2.4264e-03, -1.0573e-01, -2.0444e-02,
         3.9112e-02, -1.4461e-01,  6.4038e-02, -8.3256e-03, -4.6320e-02,
        -1.3400e-02,  1.2040e-02,  7.3522e-02, -1.6663e-02, -1.2628e-03,
        -2.7094e-02, -1.8414e-03,  6.0205e-02, -6.7361e-02,  5.6380e-02,
         2.3484e-03, -4.5203e-03,  4.1993e-02,  2.9977e-02, -1.2228e-02,
         2.8904e-03, -1.7870e-02, -1.3307e-02, -4.5424e-02, -3.1245e-02,
         4.0651e-03,  1.0091e-01,  6.3333e-02,  1.5903e-01,  9.9152e-02,
        -2.0661e-02,  6.4784e-03,  1.3163e-03,  2.6181e-02, -9.9187e-03,
         1.4386e-02, -4.5888e-02,  5.6548e-02,  3.5045e-02,  5.5262e-02,
         3.0622e-02,  9.1758e-03, -1.0747e-01,  5.5859e-03, -5.0639e-02],
       device='cuda:0')
  • 捕获了性别信息

  • 捕获了首都信息

# Cosine similarity between a word-vector offset (each word minus the shared
# reference word) and the male-minus-female offset.
for royal in ('国王', '公主'):
    royal_offset = word2vec(royal) - word2vec('王妃')
    gender_offset = word2vec('男') - word2vec('女')
    print(torch.nn.functional.cosine_similarity(royal_offset, gender_offset, dim=0))
tensor(0.1429, device='cuda:0')
tensor(0.0366, device='cuda:0')
# Cosine similarity between a country-minus-city offset and the reference
# country-minus-capital offset.
for country in ('日本', '韩国'):
    country_offset = word2vec(country) - word2vec('东京')
    reference_offset = word2vec('中国') - word2vec('北京')
    print(torch.nn.functional.cosine_similarity(country_offset, reference_offset, dim=0))
tensor(0.4674, device='cuda:0')
tensor(0.3933, device='cuda:0')
  • 计算相似词

# Nearest neighbours of a single word; the printed result maps each
# neighbouring word to its similarity score.
print(word2vec.most_similar('北京')) 
print('\n')
{'上海': 0.6443496942520142, '天津': 0.6384099721908569, '西安': 0.611718475818634, '南京': 0.6113559603691101, '北京市': 0.6093109846115112, '海淀': 0.6049214601516724, '广州': 0.5977935791015625, '京城': 0.5955069661140442, '沈阳': 0.5865166187286377, '深圳': 0.580772876739502}

5.2 获取基于上下文的词向量:语言模型的隐藏层表征#

同样以3.1节中加载的gpt2-chinese-cluecorpussmall为例

from transformers import BertTokenizer, GPT2LMHeadModel, TextGenerationPipeline
ckpt_path = "uer/gpt2-chinese-cluecorpussmall" # checkpoint identifier / path
tokenizer = BertTokenizer.from_pretrained(ckpt_path) # tokenizer
model = GPT2LMHeadModel.from_pretrained(ckpt_path) # language model
model.config.output_hidden_states = True  # make the model return its hidden states
inputs = tokenizer('小明喜欢吃西瓜。小明喜欢打篮球。小明经常去花店', return_tensors="pt")
# Only the representations are read, so run the forward pass without autograd
# to avoid building and retaining a computation graph.
with torch.no_grad():
    outputs = model(**inputs)

# hidden_states[-1] is the last entry of the returned hidden states.
print('\n' + '='*10 + '最后一层输出的内隐表征维度: ' + '='*10)
print(str(outputs.hidden_states[-1].shape) + '  1 x 输入字数 x 表征维度')
==========最后一层输出的内隐表征维度: ==========
torch.Size([1, 25, 768])  1 x 输入字数 x 表征维度