Projet de fin de session
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

54 lines
2.3 KiB

  1. import emoji
  2. from nltk.tokenize import sent_tokenize
  3. # Prétraitement
  def pretraitement(article,tok,ner_tagger,pos_tagger):
      """Split an article into sentences and extract NER, POS and emoji tokens.

      Each sentence produced by ``sent_tokenize`` is tokenized with ``tok``,
      every token is passed through ``emoji.demojize``, and the tokens are
      then split into emoji tokens and text tokens. Text tokens are tagged by
      both taggers and the results are filtered.

      Args:
          article: Text of the article to process.
          tok: Object exposing ``tokenize(sentence)`` returning the tokens of
              a sentence.
          ner_tagger: Object exposing ``tag(tokens)``; items whose element at
              index 1 is ``'O'`` are discarded.
          pos_tagger: Object exposing ``tag(tokens)``; only items whose
              element at index 1 is one of ADJ, ADV, INTJ, NOUN, PROPN or
              VERB are kept.

      Returns:
          A 3-tuple ``(ner_tokens, pos_tokens, emoji_tokens)`` of parallel
          lists with one entry per retained sentence. A sentence is retained
          only if it still has at least one text token once emoji tokens are
          removed. Each emoji entry is a list of ``(token, position)`` pairs
          where ``position`` is the 1-based index of the token in the
          demojized token list of its sentence.

      Sentences whose processing raises an exception are skipped and the
      error is logged, so one bad sentence does not abort the whole article.
      """
      # Local import so this block does not depend on a file-level import.
      import logging

      logger = logging.getLogger(__name__)
      # Open-class POS tags to keep; closed-class words are dropped.
      open_class_tags = frozenset(('ADJ','ADV','INTJ','NOUN','PROPN','VERB'))

      article_ner_tokens = []
      article_pos_tokens = []
      article_emoji_tokens = []
      # Tokenize the article into sentences
      for sentence in sent_tokenize(article):
          try:
              if not sentence:
                  continue
              # Tokenize the sentence and turn emoji into their text aliases
              sentence_tokens = [emoji.demojize(token) for token in tok.tokenize(sentence) if token]
              # NOTE(review): any token beginning with ":" is treated as an
              # emoji alias, which also matches a bare ":" punctuation token.
              emoji_tokens = [(token, i) for i, token in enumerate(sentence_tokens, 1) if token.startswith(":")]
              sentence_tokens = [token for token in sentence_tokens if not token.startswith(":")]
              if not sentence_tokens:
                  # Nothing left to tag (empty or emoji-only sentence)
                  continue
              # Keep only the tokens tagged as part of a named entity
              sentence_ner = ner_tagger.tag(sentence_tokens)
              ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']
              # Drop closed-class words based on their POS tag
              sentence_pos = pos_tagger.tag(sentence_tokens)
              pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in open_class_tags]
              # Append to the per-sentence result lists (kept parallel)
              article_ner_tokens.append(ner_tokens)
              article_pos_tokens.append(pos_tokens)
              article_emoji_tokens.append(emoji_tokens)
          except Exception:
              # Best-effort: skip the failing sentence but leave a trace.
              # KeyboardInterrupt/SystemExit are deliberately not caught.
              logger.exception("pretraitement: skipping sentence %r", sentence)
      return article_ner_tokens, article_pos_tokens, article_emoji_tokens
  def aggreger_ner_tags(article):
      """Count how many times each named-entity item occurs in an article.

      Args:
          article: Indexable whose element 0 is an iterable of sentences,
              each sentence being an iterable of hashable entity items.

      Returns:
          A dict mapping each entity item to its number of occurrences, in
          order of first appearance.
      """
      counts = {}
      ner_sentences = article[0]
      # Walk every entity of every sentence as one flat stream
      for item in (entity for phrase in ner_sentences for entity in phrase):
          if item in counts:
              counts[item] += 1
          else:
              counts[item] = 1
      return counts
  def aggreger_pos_tags(article):
      """Count how many times each POS-tagged item occurs in an article.

      Args:
          article: Indexable whose element 1 is an iterable of sentences,
              each sentence being an iterable of hashable tagged items.

      Returns:
          A dict mapping each tagged item to its number of occurrences, in
          order of first appearance.
      """
      frequencies = {}
      for phrase in article[1]:
          for tagged in phrase:
              try:
                  frequencies[tagged] += 1
              except KeyError:
                  # First time this item is seen
                  frequencies[tagged] = 1
      return frequencies
  def aggreger_emoji(article):
      """Group the recorded positions of every emoji found in an article.

      Args:
          article: Indexable whose element 2 is an iterable of sentences,
              each sentence being an iterable of ``(emoji, position)`` pairs.

      Returns:
          A dict mapping each emoji to the list of its positions, in the
          order they were encountered.
      """
      positions_by_emoji = {}
      for phrase in article[2]:
          for name, position in phrase:
              if name not in positions_by_emoji:
                  positions_by_emoji[name] = []
              positions_by_emoji[name].append(position)
      return positions_by_emoji