import re
from collections import Counter

import jieba
import matplotlib.pyplot as plt
import numpy as np
from nltk.corpus import stopwords
from nltk.tokenize import word_tokenize, RegexpTokenizer
from PIL import Image
from wordcloud import WordCloud, STOPWORDS, ImageColorGenerator
def filtering_en(file):
    """Return the 100 most frequent content words of an English text file.

    Tokenizes the file on runs of lowercase letters, removes NLTK English
    stopwords plus a project-specific stopword list, and keeps the 100 most
    frequent remaining tokens.

    Parameters
    ----------
    file : str
        Path to a UTF-8 text file (assumed already lowercased — the regexp
        only matches [a-z]; TODO confirm input is pre-lowercased).

    Returns
    -------
    dict[str, int]
        The top-100 tokens mapped to their frequencies.
    """
    with open(file, 'r', encoding='utf-8') as f:
        # Select only words of length >= 4, dropping all other symbols.
        # (The original tokenized at {3,} then re-filtered len >= 4; the two
        # conditions are folded into a single regexp here.)
        tokenizer = RegexpTokenizer("[a-z]{4,}")
        tokens = tokenizer.tokenize(f.read())

    # Corpus-specific noise words (truncated OCR/extraction fragments such as
    # 'ostalgia', 'edikides', plus generic filler like 'also', 'would').
    new_sw = [
        'doi', 'oogle', 'google', 'research', 'often',
        'also', 'ildschut', 'atcho', 'made', 'might', 'would', 'ostalgia',
        'ersonal', 'whether', 'example', 'owever', 'outledge', 'nticipatory', 'meekes',
        'researchers', 'edikides', 'toward', 'cholar', 'sychol', 'esearch', 'sychology',
        'among', 'much', 'heung', 'make', 'rump', 'says', 'even', 'makes', 'could', 'used',
        'rndt', 'page', 'text', 'many', 'ross', 'helps'
    ]
    # A set gives O(1) membership tests instead of scanning a list per token.
    sw = set(stopwords.words("english"))
    sw.update(new_sw)

    filtered_list = [w for w in tokens if w not in sw]
    fq = Counter(filtered_list)            # word frequency in filtered list
    return dict(fq.most_common(100))       # top 100 words for the word cloud
dict_en = filtering_en('../itrameur/contextes-eng.txt')
# English word cloud for the button that links to the English table page.
wc_bouton = WordCloud(
    background_color='honeydew',
    scale=4,  # render at 4x resolution for a sharper image
)
wc_bouton.generate_from_frequencies(dict_en)
wc_bouton.to_file('../images/tab-tableaux/en.png')
plt.imshow(wc_bouton)
plt.axis("off")  # hide matplotlib axes around the image
plt.show()
# English word cloud shaped by the 'moon' mask image.
masque = np.array(Image.open('../images/lune.jpeg'))  # white areas of the mask are left empty
wc_masque = WordCloud(
    background_color='white',
    scale=4,
    font_path='/Library/Fonts/Brill-Roman.ttf',  # NOTE(review): macOS-specific font path
    mask=masque,
    contour_color='yellow',
    contour_width=36  # thick outline drawn around the mask shape
)
wc_masque.generate_from_frequencies(dict_en)
wc_masque.to_file('../images/image-en.png')
plt.imshow(wc_masque)
plt.axis("off")
plt.show()
# CHINESE: segment the dump file with jieba to build the two Chinese word clouds.
# NOTE(review): the original source had jieba's console log ("Building prefix
# dict from the default dictionary ...") pasted into the file, which is a
# SyntaxError in a plain .py script; it has been removed. jieba emits that
# message to stderr the first time its dictionary is loaded.
with open('../itrameur/contextes-ch.txt', 'r', encoding='UTF-8') as fR_wc:
    with open('../itrameur/contextes-ch_seg.txt', 'w', encoding='UTF-8') as fW_wc:
        line1 = fR_wc.read()  # read the whole input as one string
        seg_list1 = jieba.cut(line1, cut_all=False, HMM=True)  # tokenize (accurate mode)
        fW_wc.write(' '.join(seg_list1))  # write space-separated tokens
# CHINESE: segment the dump file with jieba so it can be loaded into iTrameur.
with open('../itrameur/dumps-text-ch.txt', 'r', encoding='UTF-8') as fR_itr, \
        open('../itrameur/dumps-text-ch_seg.txt', 'w', encoding='UTF-8') as fW_itr:
    line2 = fR_itr.read()
    # Accurate-mode segmentation with the HMM for out-of-vocabulary words.
    seg_list2 = jieba.cut(line2, cut_all=False, HMM=True)
    # Join the lazy token stream with spaces and write it out in one call.
    fW_itr.write(' '.join(seg_list2))
def filtering_ch(f_seg, f_sw):
    """Return the 100 most frequent Chinese words of a pre-segmented file.

    Filters the segmented text by excluding stopwords and non-CJK tokens,
    then keeps the 100 most frequent words for word-cloud creation.

    Parameters
    ----------
    f_seg : str
        Path to a UTF-8 file of space-separated (jieba-segmented) Chinese text.
    f_sw : str
        Path to a UTF-8 stopword file, one stopword per line.

    Returns
    -------
    dict[str, int]
        The top-100 tokens (CJK-only, length >= 2) mapped to their frequencies.
    """
    with open(f_seg, 'r', encoding='utf-8') as seg_file:
        # Keep only runs of >= 2 CJK Unified Ideographs: drops latin text,
        # digits, punctuation and single-character tokens in one pass.
        # (re.findall replaces the original nltk RegexpTokenizer, which is a
        # thin wrapper around the same call.)
        tokens = re.findall(r"[\u4e00-\u9fa5]{2,}", seg_file.read())

    with open(f_sw, 'r', encoding='utf-8') as sw_file:
        # A set gives O(1) membership tests instead of scanning a list per token.
        stop_set = {line.strip() for line in sw_file}

    filtered_list = [w for w in tokens if w not in stop_set]
    fq = Counter(filtered_list)
    return dict(fq.most_common(100))
dict_ch = filtering_ch(
    '../itrameur/contextes-ch_seg.txt',
    '../itrameur/mot-vide_ch.txt'
)
# print(dict_ch)
# Chinese word cloud for the button that links to the Chinese table page.
wc_button = WordCloud(
    background_color='honeydew',
    font_path='/Library/Fonts/SourceHanSerif.ttc',  # a CJK-capable font is required to render Chinese
    scale=4
)
wc_button.generate_from_frequencies(dict_ch)
wc_button.to_file ('../images/tab-tableaux/ch.png')
plt.imshow(wc_button)
plt.axis('off')
plt.show()
# Chinese word cloud shaped by the 'house' mask image.
mask = np.array(Image.open('../images/maison.png'))  # white areas of the mask are left empty
wc_mask = WordCloud(
    background_color='white',
    font_path='/Library/Fonts/SourceHanSerif.ttc',  # NOTE(review): macOS-specific font path
    scale=4,
    mask=mask,
    contour_color='lightblue',
    contour_width=36  # thick outline drawn around the mask shape
)
wc_mask.generate_from_frequencies(dict_ch)
wc_mask.to_file('../images/image-ch.png')
plt.imshow(wc_mask)
plt.axis('off')
plt.show()