Projet de fin de session
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-'), and can be up to 35 characters long.
 
 
 
 
 
 

304 lines
7.0 KiB

  1. {
  2. "cells": [
  3. {
  4. "cell_type": "code",
  5. "execution_count": null,
  6. "metadata": {},
  7. "outputs": [],
  8. "source": [
  9. "# Aller chercher les synsets\n",
  10. "# Variantes morphologiques\n",
  11. "# Enlever les noms des autres commenteux\n",
  12. "# Traiter les émoticones"
  13. ]
  14. },
  15. {
  16. "cell_type": "code",
  17. "execution_count": null,
  18. "metadata": {},
  19. "outputs": [],
  20. "source": [
  21. "import pandas as pd\n",
  22. "import numpy as np\n",
  23. "from nltk.corpus import stopwords\n",
  24. "from nltk.tokenize import TweetTokenizer\n",
  25. "from nltk.parse import CoreNLPParser\n",
  26. "import re\n",
  27. "import pickle\n",
  28. "import emoji\n",
  29. "import pretraitement as pr"
  30. ]
  31. },
  32. {
  33. "cell_type": "markdown",
  34. "metadata": {},
  35. "source": [
  36. "## Tokenisation des commentaires\n",
  37. "\n",
  38. "Utilisation du TweetTokenizer, car il est davantage adapté au contenu publié par les utilisateurs sur les médias sociaux."
  39. ]
  40. },
  41. {
  42. "cell_type": "code",
  43. "execution_count": null,
  44. "metadata": {},
  45. "outputs": [],
  46. "source": [
  47. "# Création de l'objet Tokenizer\n",
  48. "tok = TweetTokenizer(preserve_case=True)"
  49. ]
  50. },
  51. {
  52. "cell_type": "code",
  53. "execution_count": null,
  54. "metadata": {},
  55. "outputs": [],
  56. "source": [
  57. "pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')"
  58. ]
  59. },
  60. {
  61. "cell_type": "code",
  62. "execution_count": null,
  63. "metadata": {},
  64. "outputs": [],
  65. "source": [
  66. "ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')"
  67. ]
  68. },
  69. {
  70. "cell_type": "code",
  71. "execution_count": null,
  72. "metadata": {},
  73. "outputs": [],
  74. "source": [
  75. "commentaires_df = pd.read_csv(\"refined_data/commentaires_df.csv\")"
  76. ]
  77. },
  78. {
  79. "cell_type": "code",
  80. "execution_count": null,
  81. "metadata": {},
  82. "outputs": [],
  83. "source": [
  84. "commentaires_df.head(10)"
  85. ]
  86. },
  87. {
  88. "cell_type": "code",
  89. "execution_count": null,
  90. "metadata": {},
  91. "outputs": [],
  92. "source": [
  93. "# Drop the first column, which serves no purpose; errors='ignore' keeps this cell safe to re-run.\n",
  94. "commentaires_df = commentaires_df.drop(columns=['Unnamed: 0'], errors='ignore')"
  95. ]
  96. },
  97. {
  98. "cell_type": "markdown",
  99. "metadata": {},
  100. "source": [
  101. "## Noms des auteurs\n",
  102. "\n",
  103. "Extraction du nom des auteurs pour chaque commentaire et ses sous-commentaires"
  104. ]
  105. },
  106. {
  107. "cell_type": "code",
  108. "execution_count": null,
  109. "metadata": {},
  110. "outputs": [],
  111. "source": [
  112. "names_df = pd.DataFrame(commentaires_df.groupby(['post_id','comment_id'])['name'], columns=['post_comment','list_names'])"
  113. ]
  114. },
  115. {
  116. "cell_type": "code",
  117. "execution_count": null,
  118. "metadata": {},
  119. "outputs": [],
  120. "source": [
  121. "names_df['list_names'] = names_df.apply(lambda x: list(set(x['list_names'])), axis=1)"
  122. ]
  123. },
  124. {
  125. "cell_type": "code",
  126. "execution_count": null,
  127. "metadata": {},
  128. "outputs": [],
  129. "source": [
  130. "names_df['post_id'] = names_df.apply(lambda x: x['post_comment'][0], axis=1)\n",
  131. "names_df['comment_id'] = names_df.apply(lambda x: x['post_comment'][1], axis=1)"
  132. ]
  133. },
  134. {
  135. "cell_type": "code",
  136. "execution_count": null,
  137. "metadata": {},
  138. "outputs": [],
  139. "source": [
  140. "del names_df['post_comment']"
  141. ]
  142. },
  143. {
  144. "cell_type": "code",
  145. "execution_count": null,
  146. "metadata": {},
  147. "outputs": [],
  148. "source": [
  149. "names_df.head(10)"
  150. ]
  151. },
  152. {
  153. "cell_type": "markdown",
  154. "metadata": {},
  155. "source": [
  156. "## Traitement du nom des auteurs dans les textes des commentaires"
  157. ]
  158. },
  159. {
  160. "cell_type": "code",
  161. "execution_count": null,
  162. "metadata": {},
  163. "outputs": [],
  164. "source": [
  165. "commentaires_df_names = commentaires_df.merge(names_df)"
  166. ]
  167. },
  168. {
  169. "cell_type": "code",
  170. "execution_count": null,
  171. "metadata": {},
  172. "outputs": [],
  173. "source": [
  174. "def list_auteurs_referes(comment,names):\n",
  175. "    \"\"\"Return the distinct names from `names` that occur in `comment` ([] when none can be matched).\"\"\"\n",
  176. "    try:\n",
  177. "        # Non-text entries (e.g. NaN) and empty names are skipped instead of ending the scan early.\n",
  178. "        hits = (nom for nom in names if isinstance(nom, str) and nom and nom in comment)\n",
  179. "        # dict.fromkeys de-duplicates in first-seen order, so the result is reproducible run to run.\n",
  180. "        return list(dict.fromkeys(hits)) if isinstance(comment, str) else []\n",
  181. "    except TypeError:\n",
  182. "        # `names` is not iterable, so there is nothing to look for.\n",
  183. "        return []\n",
  184. "\n",
  185. "def remove_names(comment,names):\n",
  186. "    \"\"\"Return `comment` with each name in `names` removed; unusable inputs leave it unchanged.\"\"\"\n",
  187. "    try:\n",
  188. "        for nom in sorted((n for n in names if isinstance(n, str)), key=len, reverse=True):\n",
  189. "            comment = comment.replace(nom, '')  # longest first: a name inside a longer one leaves no fragment\n",
  190. "    except (AttributeError, TypeError):\n",
  191. "        pass  # non-text comment (e.g. NaN) or non-iterable `names`: keep the comment as it is\n",
  192. "    return comment"
  193. ]
  194. },
  195. {
  196. "cell_type": "markdown",
  197. "metadata": {},
  198. "source": [
  199. "## Nettoyage des commentaires et traitement des émoticônes"
  200. ]
  201. },
  202. {
  203. "cell_type": "code",
  204. "execution_count": null,
  205. "metadata": {},
  206. "outputs": [],
  207. "source": [
  208. "commentaires_df_names['auteurs_referes'] = commentaires_df_names.apply(lambda x: str(list_auteurs_referes(x['comment'],x['list_names'])), axis=1)"
  209. ]
  210. },
  211. {
  212. "cell_type": "code",
  213. "execution_count": null,
  214. "metadata": {},
  215. "outputs": [],
  216. "source": [
  217. "commentaires_df_names['comment_clean'] = commentaires_df_names.apply(lambda x: str(remove_names(x['comment'],x['list_names'])), axis=1)"
  218. ]
  219. },
  220. {
  221. "cell_type": "code",
  222. "execution_count": null,
  223. "metadata": {},
  224. "outputs": [],
  225. "source": [
  226. "commentaires_df_names.head(10)"
  227. ]
  228. },
  229. {
  230. "cell_type": "code",
  231. "execution_count": null,
  232. "metadata": {},
  233. "outputs": [],
  234. "source": [
  235. "commentaires_pretraite = []\n",
  236. "compteur=0\n",
  237. "for x in list(commentaires_df_names[\"comment_clean\"]):\n",
  238. " print(str(compteur)+\": \"+x)\n",
  239. " commentaires_pretraite.append(pr.pretraitement(x,tok,ner_tagger,pos_tagger))\n",
  240. " compteur += 1"
  241. ]
  242. },
  243. {
  244. "cell_type": "code",
  245. "execution_count": null,
  246. "metadata": {},
  247. "outputs": [],
  248. "source": [
  249. "commentaires_pretraite[:10]  # preview only: the list holds one entry per processed comment"
  250. ]
  251. },
  252. {
  253. "cell_type": "code",
  254. "execution_count": null,
  255. "metadata": {},
  256. "outputs": [],
  257. "source": [
  258. "commentaires_df_names['ner_dict']=[pr.aggreger_ner_tags(article) for article in commentaires_pretraite]\n",
  259. "commentaires_df_names['pos_dict']=[pr.aggreger_pos_tags(article) for article in commentaires_pretraite]\n",
  260. "commentaires_df_names['emoji_dict']=[pr.aggreger_emoji(article) for article in commentaires_pretraite]"
  261. ]
  262. },
  263. {
  264. "cell_type": "code",
  265. "execution_count": null,
  266. "metadata": {},
  267. "outputs": [],
  268. "source": [
  269. "# The with-block closes the file even if pickling raises part-way through.\n",
  270. "with open(\"pickle/commentaires_df.pickle\",\"wb\") as f:\n",
  271. "    pickle.dump(commentaires_df_names,f)"
  272. ]
  273. },
  274. {
  275. "cell_type": "code",
  276. "execution_count": null,
  277. "metadata": {},
  278. "outputs": [],
  279. "source": []
  280. }
  281. ],
  282. "metadata": {
  283. "kernelspec": {
  284. "display_name": "Python 3",
  285. "language": "python",
  286. "name": "python3"
  287. },
  288. "language_info": {
  289. "codemirror_mode": {
  290. "name": "ipython",
  291. "version": 3
  292. },
  293. "file_extension": ".py",
  294. "mimetype": "text/x-python",
  295. "name": "python",
  296. "nbconvert_exporter": "python",
  297. "pygments_lexer": "ipython3",
  298. "version": "3.7.4"
  299. }
  300. },
  301. "nbformat": 4,
  302. "nbformat_minor": 4
  303. }