Copyright notice: this is an original article by the author; do not repost without permission. https://blog.csdn.net/github_36326955/article/details/81981563
1. Probabilistic programming
import random

import numpy as np


def add_unknown_words(word_vecs, vocab, min_df=1, k=300):
    """
    For words that occur in at least min_df documents but have no pre-trained
    vector, create a random word vector; the range (-0.25, 0.25) gives roughly
    the same variance as the pre-trained ones.
    """
    for word in vocab:
        if word not in word_vecs and vocab[word] >= min_df:
            word_vecs[word] = np.random.uniform(-0.25, 0.25, k)
            print(word)


def pro(prob):
    # Execute the body only with probability `prob`.
    if random.random() <= prob:
        pass
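The `pro` pattern above runs a block of code only with a given probability. A small illustration of the same idea applied to down-sampling a list (the helper name `keep_with_probability` is mine, not from the original post):

import random


def keep_with_probability(p, items):
    # Keep each item independently with probability p (simple down-sampling).
    return [x for x in items if random.random() <= p]


sampled = keep_with_probability(0.1, range(100))
print(len(sampled))   # roughly 10 on average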
2. Defining deprecated code
Sometimes we plan to deprecate a function but worry about breaking older versions. To keep the interface compatible, you can follow the pattern below (taken from gensim's Word2Vec):
@deprecated("Method will be removed in 4.0.0, use self.wv.__getitem__() instead")
def __getitem__(self, words):
    """
    Deprecated. Use self.wv.__getitem__() instead.
    Refer to the documentation for `gensim.models.keyedvectors.Word2VecKeyedVectors.__getitem__`
    """
    return self.wv.__getitem__(words)
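The `deprecated` decorator used above is provided by gensim. If you want the same behavior in your own project, a minimal sketch looks like the following (this implementation is an assumption of mine, not gensim's actual code):

import functools
import warnings


def deprecated(reason):
    # Minimal sketch: warn once per call, then forward to the original function.
    def decorator(func):
        @functools.wraps(func)
        def wrapper(*args, **kwargs):
            warnings.warn(reason, category=DeprecationWarning, stacklevel=2)
            return func(*args, **kwargs)
        return wrapper
    return decorator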
3. Counting the lines in a file
from os import path


def count_lines(f):
    if path.isfile(f):  # test whether the path is a regular file
        num_lines = sum(1 for line in open(f))
        # The line above is equivalent to:
        #   a = [1 for line in open(f)]   # a = [1, 1, 1, ..., 1]
        #   num_lines = sum(a)
        return num_lines
    else:
        return 0
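A quick usage example (the file name 'corpus.txt' is just a placeholder):

print(count_lines('corpus.txt'))   # e.g. 12345; returns 0 if the path is not a regular file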
4. Counting word frequencies over a set of files
#! /usr/bin/env python
# -*- coding: utf-8 -*-
from collections import Counter
from concurrent.futures import ProcessPoolExecutor


def count_words(file):
    """Count word frequencies in a single file."""
    c = Counter()
    with open(file, 'r') as f:
        for l in f:
            words = l.strip().split()
            c.update(words)
    return c


def count_textfiles(files, workers=1):
    """Count word frequencies over a list of files, using up to `workers` processes."""
    c = Counter()
    with ProcessPoolExecutor(max_workers=workers) as executor:
        for c_ in executor.map(count_words, files):
            c.update(c_)
    return c
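A possible invocation (the file names are placeholders). Note that on some platforms ProcessPoolExecutor requires the call to be guarded by `if __name__ == '__main__':`:

if __name__ == '__main__':
    freq = count_textfiles(['part1.txt', 'part2.txt', 'part3.txt'], workers=4)
    print(freq.most_common(10))   # the ten most frequent words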
5. Timing code
from time import time

t0 = time()
# ... your code here ...
t1 = time()
print('make_directed: added missing edges {}s'.format(t1 - t0))
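If the same timing pattern is needed in many places, it can be wrapped in a small context manager. The sketch below is one possible way to do this (the `timed` helper is my addition, not from the original post):

from contextlib import contextmanager
from time import time


@contextmanager
def timed(label):
    # Measure the wall-clock time of the enclosed block and print it.
    t0 = time()
    yield
    print('{}: {:.3f}s'.format(label, time() - t0))


with timed('make_directed: added missing edges'):
    pass  # your code here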