import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def filtering_en(file):
    """Return the 100 most frequent content words of an English text file.

    Tokenizes the file on runs of lowercase letters, removes NLTK English
    stopwords plus a project-specific stopword list, and keeps the 100 most
    frequent remaining tokens.

    Parameters
    ----------
    file : str
        Path to a UTF-8 text file (assumed already lowercased — the regexp
        only matches [a-z]; TODO confirm input is pre-lowercased).

    Returns
    -------
    dict[str, int]
        The top-100 tokens mapped to their frequencies.
    """
    with open(file, 'r', encoding='utf-8') as f:
        # Select only words of length >= 4, dropping all other symbols.
        # (The original tokenized at {3,} then re-filtered len >= 4; the two
        # conditions are folded into a single regexp here.)
        tokenizer = RegexpTokenizer("[a-z]{4,}")
        tokens = tokenizer.tokenize(f.read())

    # Corpus-specific noise words (truncated OCR/extraction fragments such as
    # 'ostalgia', 'edikides', plus generic filler like 'also', 'would').
    new_sw = [
        'doi', 'oogle', 'google', 'research', 'often',
        'also', 'ildschut', 'atcho', 'made', 'might', 'would', 'ostalgia',
        'ersonal', 'whether', 'example', 'owever', 'outledge', 'nticipatory', 'meekes',
        'researchers', 'edikides', 'toward', 'cholar', 'sychol', 'esearch', 'sychology',
        'among', 'much', 'heung', 'make', 'rump', 'says', 'even', 'makes', 'could', 'used',
        'rndt', 'page', 'text', 'many', 'ross', 'helps'
    ]
    # A set gives O(1) membership tests instead of scanning a list per token.
    sw = set(stopwords.words("english"))
    sw.update(new_sw)

    filtered_list = [w for w in tokens if w not in sw]
    fq = Counter(filtered_list)            # word frequency in filtered list
    return dict(fq.most_common(100))       # top 100 words for the word cloud
dict_en = filtering_en('../itrameur/contextes-eng.txt')
# English word cloud for the button that links to the English table page.
wc_bouton = WordCloud(
    background_color='honeydew',
    scale=4,  # render at 4x resolution for a sharper image
)
wc_bouton.generate_from_frequencies(dict_en)
wc_bouton.to_file('../images/tab-tableaux/en.png')
plt.imshow(wc_bouton)
plt.axis("off")  # hide matplotlib axes around the image
plt.show()
# English word cloud shaped by the 'moon' mask image.
masque = np.array(Image.open('../images/lune.jpeg'))  # white areas of the mask are left empty
wc_masque = WordCloud(
    background_color='white',
    scale=4,
    font_path='/Library/Fonts/Brill-Roman.ttf',  # NOTE(review): macOS-specific font path
    mask=masque,
    contour_color='yellow',
    contour_width=36  # thick outline drawn around the mask shape
)
wc_masque.generate_from_frequencies(dict_en)
wc_masque.to_file('../images/image-en.png')
plt.imshow(wc_masque)
plt.axis("off")
plt.show()
# CHINESE: segment the dump file with jieba to build the two Chinese word clouds.
# NOTE(review): the original source had jieba's console log ("Building prefix
# dict from the default dictionary ...") pasted into the file, which is a
# SyntaxError in a plain .py script; it has been removed. jieba emits that
# message to stderr the first time its dictionary is loaded.
with open('../itrameur/contextes-ch.txt', 'r', encoding='UTF-8') as fR_wc:
    with open('../itrameur/contextes-ch_seg.txt', 'w', encoding='UTF-8') as fW_wc:
        line1 = fR_wc.read()  # read the whole input as one string
        seg_list1 = jieba.cut(line1, cut_all=False, HMM=True)  # tokenize (accurate mode)
        fW_wc.write(' '.join(seg_list1))  # write space-separated tokens
# CHINESE: segment the dump file with jieba so it can be loaded into iTrameur.
with open('../itrameur/dumps-text-ch.txt', 'r', encoding='UTF-8') as fR_itr, \
        open('../itrameur/dumps-text-ch_seg.txt', 'w', encoding='UTF-8') as fW_itr:
    line2 = fR_itr.read()
    # Accurate-mode segmentation with the HMM for out-of-vocabulary words.
    seg_list2 = jieba.cut(line2, cut_all=False, HMM=True)
    # Join the lazy token stream with spaces and write it out in one call.
    fW_itr.write(' '.join(seg_list2))
def filtering_ch(f_seg, f_sw):
    """Return the 100 most frequent Chinese words of a pre-segmented file.

    Filters the segmented text by excluding stopwords and non-CJK tokens,
    then keeps the 100 most frequent words for word-cloud creation.

    Parameters
    ----------
    f_seg : str
        Path to a UTF-8 file of space-separated (jieba-segmented) Chinese text.
    f_sw : str
        Path to a UTF-8 stopword file, one stopword per line.

    Returns
    -------
    dict[str, int]
        The top-100 tokens (CJK-only, length >= 2) mapped to their frequencies.
    """
    with open(f_seg, 'r', encoding='utf-8') as seg_file:
        # Keep only runs of >= 2 CJK Unified Ideographs: drops latin text,
        # digits, punctuation and single-character tokens in one pass.
        # (re.findall replaces the original nltk RegexpTokenizer, which is a
        # thin wrapper around the same call.)
        tokens = re.findall(r"[\u4e00-\u9fa5]{2,}", seg_file.read())

    with open(f_sw, 'r', encoding='utf-8') as sw_file:
        # A set gives O(1) membership tests instead of scanning a list per token.
        stop_set = {line.strip() for line in sw_file}

    filtered_list = [w for w in tokens if w not in stop_set]
    fq = Counter(filtered_list)
    return dict(fq.most_common(100))
dict_ch = filtering_ch(
    '../itrameur/contextes-ch_seg.txt',
    '../itrameur/mot-vide_ch.txt'
)
# print(dict_ch)
# Chinese word cloud for the button that links to the Chinese table page.
wc_button = WordCloud(
    background_color='honeydew',
    font_path='/Library/Fonts/SourceHanSerif.ttc',  # a CJK-capable font is required to render Chinese
    scale=4
)
wc_button.generate_from_frequencies(dict_ch)
wc_button.to_file ('../images/tab-tableaux/ch.png')
plt.imshow(wc_button)
plt.axis('off')
plt.show()
# Chinese word cloud shaped by the 'house' mask image.
mask = np.array(Image.open('../images/maison.png'))  # white areas of the mask are left empty
wc_mask = WordCloud(
    background_color='white',
    font_path='/Library/Fonts/SourceHanSerif.ttc',  # NOTE(review): macOS-specific font path
    scale=4,
    mask=mask,
    contour_color='lightblue',
    contour_width=36  # thick outline drawn around the mask shape
)
wc_mask.generate_from_frequencies(dict_ch)
wc_mask.to_file('../images/image-ch.png')
plt.imshow(wc_mask)
plt.axis('off')
plt.show()