Browse Source

ajout du format de sortie LaTeX pour le rapport

master
FRANCOIS PELLETIER 11 months ago
parent
commit
1cb7ad75b5
6 changed files with 656 additions and 29 deletions
  1. +2
    -0
      .gitignore
  2. +605
    -21
      Analyse_Articles.ipynb
  3. +1
    -1
      Makefile
  4. +10
    -1
      README.md
  5. +22
    -3
      rapport.md
  6. +16
    -3
      textes_articles.ipynb

+ 2
- 0
.gitignore View File

@@ -123,3 +123,5 @@ dmypy.json
*.Rproj
*.pdf

# fichiers latex
*.tex

+ 605
- 21
Analyse_Articles.ipynb View File

@@ -1,8 +1,19 @@
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"# Analyse des données pour le rapport\n",
"\n",
"## Lecture des fichiers de données et affichage d'un échantillon de données\n",
"\n",
"### Articles"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
@@ -11,7 +22,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
@@ -22,16 +33,191 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 4,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>media</th>\n",
" <th>post_id</th>\n",
" <th>text</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>L'ancien international de football Vikash Dhor...</td>\n",
" <td>{('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON...</td>\n",
" <td>{('ancien', 'ADJ'): 3, ('international', 'NOUN...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>FIG</td>\n",
" <td>5dc7acd0d44b1-10157142962296339</td>\n",
" <td>Les personnes qui iront manifester dimanche 10...</td>\n",
" <td>{('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ...</td>\n",
" <td>{('personnes', 'NOUN'): 2, ('iront', 'VERB'): ...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>FIG</td>\n",
" <td>5dc7adde8bd8e-10157142482251339</td>\n",
" <td>Selon Jason Farago, la Joconde prend le musée ...</td>\n",
" <td>{('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):...</td>\n",
" <td>{('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ab8df19a0-10157144491741339</td>\n",
" <td>We're just checking that you want to follow a ...</td>\n",
" <td>{}</td>\n",
" <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac188a6d6-10157143773291339</td>\n",
" <td>Les défections se sont enchaînées, et peu de p...</td>\n",
" <td>{('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER...</td>\n",
" <td>{('défections', 'NOUN'): 2, ('enchaînées', 'VE...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac51516dc-10157143472656339</td>\n",
" <td>We're just checking that you want to follow a ...</td>\n",
" <td>{}</td>\n",
" <td>{('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ab9fe4530-10157144373586339</td>\n",
" <td>FIGAROVOX/TRIBUNE - Les derniers chiffres offi...</td>\n",
" <td>{('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON...</td>\n",
" <td>{('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ae3950eea-10157141592561339</td>\n",
" <td>La DGSI est chef de file de la lutte antiterro...</td>\n",
" <td>{('France', 'LOCATION'): 1, ('1200', 'DATE'): ...</td>\n",
" <td>{('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac9063012-10157143218116339</td>\n",
" <td>Le voyage en Chine est devenu en ce début de X...</td>\n",
" <td>{('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI...</td>\n",
" <td>{('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,...</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>FIG</td>\n",
" <td>5dc7adf1bf8ff-10157142446816339</td>\n",
" <td>Les nouvelles habitudes de consommation font s...</td>\n",
" <td>{('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ...</td>\n",
" <td>{('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" media post_id \\\n",
"0 FIG 5dc7ac7f359e2-10157143278136339 \n",
"1 FIG 5dc7acd0d44b1-10157142962296339 \n",
"2 FIG 5dc7adde8bd8e-10157142482251339 \n",
"3 FIG 5dc7ab8df19a0-10157144491741339 \n",
"4 FIG 5dc7ac188a6d6-10157143773291339 \n",
"5 FIG 5dc7ac51516dc-10157143472656339 \n",
"6 FIG 5dc7ab9fe4530-10157144373586339 \n",
"7 FIG 5dc7ae3950eea-10157141592561339 \n",
"8 FIG 5dc7ac9063012-10157143218116339 \n",
"9 FIG 5dc7adf1bf8ff-10157142446816339 \n",
"\n",
" text \\\n",
"0 L'ancien international de football Vikash Dhor... \n",
"1 Les personnes qui iront manifester dimanche 10... \n",
"2 Selon Jason Farago, la Joconde prend le musée ... \n",
"3 We're just checking that you want to follow a ... \n",
"4 Les défections se sont enchaînées, et peu de p... \n",
"5 We're just checking that you want to follow a ... \n",
"6 FIGAROVOX/TRIBUNE - Les derniers chiffres offi... \n",
"7 La DGSI est chef de file de la lutte antiterro... \n",
"8 Le voyage en Chine est devenu en ce début de X... \n",
"9 Les nouvelles habitudes de consommation font s... \n",
"\n",
" ner_dict \\\n",
"0 {('Vikash', 'PERSON'): 2, ('Dhorasoo', 'PERSON... \n",
"1 {('10', 'NUMBER'): 2, ('La', 'ORGANIZATION'): ... \n",
"2 {('Jason', 'PERSON'): 8, ('Farago', 'PERSON'):... \n",
"3 {} \n",
"4 {('Jean-Luc', 'PERSON'): 3, ('Mélenchon', 'PER... \n",
"5 {} \n",
"6 {('Claude', 'PERSON'): 2, ('Goasguen', 'PERSON... \n",
"7 {('France', 'LOCATION'): 1, ('1200', 'DATE'): ... \n",
"8 {('Chine', 'ORGANIZATION'): 1, ('New', 'LOCATI... \n",
"9 {('Carrefour', 'ORGANIZATION'): 2, ('Auchan', ... \n",
"\n",
" pos_dict \n",
"0 {('ancien', 'ADJ'): 3, ('international', 'NOUN... \n",
"1 {('personnes', 'NOUN'): 2, ('iront', 'VERB'): ... \n",
"2 {('Jason', 'PROPN'): 8, ('Farago', 'PROPN'): 8... \n",
"3 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n",
"4 {('défections', 'NOUN'): 2, ('enchaînées', 'VE... \n",
"5 {('We', 'PROPN'): 1, ('just', 'PROPN'): 1, ('c... \n",
"6 {('FIGAROVOX', 'PROPN'): 1, ('TRIBUNE', 'NOUN'... \n",
"7 {('DGSI', 'PROPN'): 2, ('est', 'VERB'): 2, ('c... \n",
"8 {('voyage', 'NOUN'): 3, ('Chine', 'PROPN'): 1,... \n",
"9 {('nouvelles', 'NOUN'): 1, ('habitudes', 'ADJ'... "
]
},
"execution_count": 4,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textes_articles_df.head(10)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Commentaires"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
@@ -42,7 +228,7 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
@@ -51,9 +237,335 @@
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 7,
"metadata": {},
"outputs": [],
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_id</th>\n",
" <th>nested_id</th>\n",
" <th>name</th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>likes</th>\n",
" <th>comment</th>\n",
" <th>media</th>\n",
" <th>post_id</th>\n",
" <th>list_names</th>\n",
" <th>auteurs_referes</th>\n",
" <th>comment_clean</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" <th>emoji_dict</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <td>0</td>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>Ycf Bullit</td>\n",
" <td>ID: 100000615866313</td>\n",
" <td>2019-11-09 14:17:13</td>\n",
" <td>0</td>\n",
" <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Ycf Bullit]</td>\n",
" <td>[]</td>\n",
" <td>C'est une blague mdr 🤣🤣🤣🤣🤣</td>\n",
" <td>{}</td>\n",
" <td>{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...</td>\n",
" <td>{':rolling_on_the_floor_laughing:': [5, 6, 7]}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>1</td>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>Steph Alcazar</td>\n",
" <td>ID: 100001175077263</td>\n",
" <td>2019-11-09 14:17:34</td>\n",
" <td>0</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Steph Alcazar]</td>\n",
" <td>[]</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>{}</td>\n",
" <td>{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>2</td>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>Töm Müstäine</td>\n",
" <td>ID: 1365879404</td>\n",
" <td>2019-11-09 14:17:51</td>\n",
" <td>0</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Töm Müstäine]</td>\n",
" <td>[]</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...</td>\n",
" <td>{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>3</td>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet</td>\n",
" <td>ID: 100000270292007</td>\n",
" <td>2019-11-09 14:18:06</td>\n",
" <td>0</td>\n",
" <td>Vasanth Toure 😍</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Vasanth Toure']</td>\n",
" <td>😍</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>4</td>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>Vasanth Toure</td>\n",
" <td>ID: 100001494607801</td>\n",
" <td>2019-11-09 14:20:57</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet Paris n'est pas prêt encore...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Pierre Crouzet']</td>\n",
" <td>Paris n'est pas prêt encore...</td>\n",
" <td>{('Paris', 'LOCATION'): 1}</td>\n",
" <td>{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>5</td>\n",
" <td>4.0</td>\n",
" <td>2</td>\n",
" <td>Pierre Crouzet</td>\n",
" <td>ID: 100000270292007</td>\n",
" <td>2019-11-09 14:26:37</td>\n",
" <td>0</td>\n",
" <td>Vasanth Toure le prochain c’est Adrien Rabiot</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Vasanth Toure']</td>\n",
" <td>le prochain c’est Adrien Rabiot</td>\n",
" <td>{('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')...</td>\n",
" <td>{('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>6</td>\n",
" <td>5.0</td>\n",
" <td>0</td>\n",
" <td>Stéphane Pirnaci</td>\n",
" <td>ID: 100008541367302</td>\n",
" <td>2019-11-09 14:18:51</td>\n",
" <td>0</td>\n",
" <td>Mdr</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Stéphane Pirnaci]</td>\n",
" <td>[]</td>\n",
" <td>Mdr</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>7</td>\n",
" <td>6.0</td>\n",
" <td>0</td>\n",
" <td>Adil Bennani</td>\n",
" <td>ID: 100006432917292</td>\n",
" <td>2019-11-09 14:19:03</td>\n",
" <td>0</td>\n",
" <td>moi je propose mamadou sissoko</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Adil Bennani]</td>\n",
" <td>[]</td>\n",
" <td>moi je propose mamadou sissoko</td>\n",
" <td>{}</td>\n",
" <td>{('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>8</td>\n",
" <td>7.0</td>\n",
" <td>0</td>\n",
" <td>Hadrien De Cournon</td>\n",
" <td>ID: 1131290552</td>\n",
" <td>2019-11-09 14:19:09</td>\n",
" <td>0</td>\n",
" <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Hadrien De Cournon]</td>\n",
" <td>[]</td>\n",
" <td>Louis Prt Corentin Corman Victor Mdv ah ouais?</td>\n",
" <td>{('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,...</td>\n",
" <td>{('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <td>9</td>\n",
" <td>8.0</td>\n",
" <td>0</td>\n",
" <td>Marwa Larose</td>\n",
" <td>ID: 100022577589611</td>\n",
" <td>2019-11-09 14:19:38</td>\n",
" <td>0</td>\n",
" <td>Marier le foot à la mairie est génial</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Marwa Larose]</td>\n",
" <td>[]</td>\n",
" <td>Marier le foot à la mairie est génial</td>\n",
" <td>{('Marier', 'PERSON'): 1}</td>\n",
" <td>{('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" comment_id nested_id name id \\\n",
"0 1.0 0 Ycf Bullit ID: 100000615866313 \n",
"1 2.0 0 Steph Alcazar ID: 100001175077263 \n",
"2 3.0 0 Töm Müstäine ID: 1365879404 \n",
"3 4.0 0 Pierre Crouzet ID: 100000270292007 \n",
"4 4.0 1 Vasanth Toure ID: 100001494607801 \n",
"5 4.0 2 Pierre Crouzet ID: 100000270292007 \n",
"6 5.0 0 Stéphane Pirnaci ID: 100008541367302 \n",
"7 6.0 0 Adil Bennani ID: 100006432917292 \n",
"8 7.0 0 Hadrien De Cournon ID: 1131290552 \n",
"9 8.0 0 Marwa Larose ID: 100022577589611 \n",
"\n",
" date likes \\\n",
"0 2019-11-09 14:17:13 0 \n",
"1 2019-11-09 14:17:34 0 \n",
"2 2019-11-09 14:17:51 0 \n",
"3 2019-11-09 14:18:06 0 \n",
"4 2019-11-09 14:20:57 0 \n",
"5 2019-11-09 14:26:37 0 \n",
"6 2019-11-09 14:18:51 0 \n",
"7 2019-11-09 14:19:03 0 \n",
"8 2019-11-09 14:19:09 0 \n",
"9 2019-11-09 14:19:38 0 \n",
"\n",
" comment media \\\n",
"0 C'est une blague mdr 🤣🤣🤣🤣🤣 FIG \n",
"1 La seule question c'est de savoir s'il fera pl... FIG \n",
"2 Romain Debrigode l info du jour qui fait plaise FIG \n",
"3 Vasanth Toure 😍 FIG \n",
"4 Pierre Crouzet Paris n'est pas prêt encore... FIG \n",
"5 Vasanth Toure le prochain c’est Adrien Rabiot FIG \n",
"6 Mdr FIG \n",
"7 moi je propose mamadou sissoko FIG \n",
"8 Louis Prt Corentin Corman Victor Mdv ah ouais? FIG \n",
"9 Marier le foot à la mairie est génial FIG \n",
"\n",
" post_id list_names \\\n",
"0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n",
"1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n",
"2 5dc7ac7f359e2-10157143278136339 [Töm Müstäine] \n",
"3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"5 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"6 5dc7ac7f359e2-10157143278136339 [Stéphane Pirnaci] \n",
"7 5dc7ac7f359e2-10157143278136339 [Adil Bennani] \n",
"8 5dc7ac7f359e2-10157143278136339 [Hadrien De Cournon] \n",
"9 5dc7ac7f359e2-10157143278136339 [Marwa Larose] \n",
"\n",
" auteurs_referes comment_clean \\\n",
"0 [] C'est une blague mdr 🤣🤣🤣🤣🤣 \n",
"1 [] La seule question c'est de savoir s'il fera pl... \n",
"2 [] Romain Debrigode l info du jour qui fait plaise \n",
"3 ['Vasanth Toure'] 😍 \n",
"4 ['Pierre Crouzet'] Paris n'est pas prêt encore... \n",
"5 ['Vasanth Toure'] le prochain c’est Adrien Rabiot \n",
"6 [] Mdr \n",
"7 [] moi je propose mamadou sissoko \n",
"8 [] Louis Prt Corentin Corman Victor Mdv ah ouais? \n",
"9 [] Marier le foot à la mairie est génial \n",
"\n",
" ner_dict \\\n",
"0 {} \n",
"1 {} \n",
"2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n",
"3 {} \n",
"4 {('Paris', 'LOCATION'): 1} \n",
"5 {('Adrien', 'PERSON'): 1, ('Rabiot', 'PERSON')... \n",
"6 {} \n",
"7 {} \n",
"8 {('Louis', 'PERSON'): 1, ('Prt', 'PERSON'): 1,... \n",
"9 {('Marier', 'PERSON'): 1} \n",
"\n",
" pos_dict \\\n",
"0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n",
"1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n",
"2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n",
"3 {} \n",
"4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n",
"5 {('prochain', 'ADJ'): 1, ('Adrien', 'PROPN'): ... \n",
"6 {} \n",
"7 {('propose', 'VERB'): 1, ('mamadou', 'NOUN'): ... \n",
"8 {('Louis', 'PROPN'): 1, ('Prt', 'PROPN'): 1, (... \n",
"9 {('Marier', 'VERB'): 1, ('foot', 'NOUN'): 1, (... \n",
"\n",
" emoji_dict \n",
"0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n",
"1 {} \n",
"2 {} \n",
"3 {} \n",
"4 {} \n",
"5 {} \n",
"6 {} \n",
"7 {} \n",
"8 {} \n",
"9 {} "
]
},
"execution_count": 7,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commentaires_df.head(10)"
]
@@ -66,54 +578,126 @@
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Nombre d'articles"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 34,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df.groupby(\"media\").count()"
"decompte_medias = textes_articles_df.groupby(\"media\").count()[[\"post_id\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 35,
"metadata": {},
"outputs": [],
"source": [
"commentaires_df.groupby(\"media\").count()"
"decompte_medias.columns = [\"Nombre de publications\"]"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 36,
"metadata": {},
"outputs": [],
"source": [
"nb_comm = commentaires_df[\"emoji_dict\"].count()\n",
"nb_comm"
"decompte_medias.to_latex(\"decompte_articles_medias.tex\")"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Nombre de commentaires total par média"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 37,
"metadata": {},
"outputs": [],
"source": [
"nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) == 1)\n",
"nb_comm_emoji"
"decompte_commentaires = commentaires_df.groupby(\"media\").count()[[\"comment_id\"]]"
]
},
{
"cell_type": "code",
"execution_count": null,
"execution_count": 38,
"metadata": {},
"outputs": [],
"source": [
"nb_comm_emoji/nb_comm"
"decompte_commentaires.columns = [\"Nombre de commentaires\"]"
]
},
{
"cell_type": "code",
"execution_count": 39,
"metadata": {},
"outputs": [],
"source": [
"decompte_commentaires.to_latex(\"decompte_comm_medias.tex\",)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### Proportion de commentaires contenant des emojis"
]
},
{
"cell_type": "code",
"execution_count": 31,
"metadata": {},
"outputs": [],
"source": [
"nb_comm = commentaires_df[\"emoji_dict\"].count()\n",
"nb_comm_emoji = sum(commentaires_df[\"emoji_dict\"].apply(lambda x: len(x)) == 1)"
]
},
{
"cell_type": "code",
"execution_count": 33,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0.13"
]
},
"execution_count": 33,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"round(nb_comm_emoji/nb_comm,2)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
@@ -136,7 +720,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
}
},
"nbformat": 4,


+ 1
- 1
Makefile View File

@@ -1,2 +1,2 @@
build: rapport.md
pandoc --filter=pandoc-citeproc rapport.md -o rapport.pdf
pandoc --filter=pandoc-citeproc -f markdown+raw_tex+latex_macros rapport.md -o rapport.pdf

+ 10
- 1
README.md View File

@@ -1,3 +1,12 @@
# nlp_a2019_tp3

Projet de fin de session
## Installation des dépendances du projet

Installer Anaconda3

pip install newspaper3k
pip install emoji

## Compilation du rapport

make

+ 22
- 3
rapport.md View File

@@ -12,6 +12,12 @@ fontsize: 12pt
geometry: margin=1in
bibliography: NLP-TP3.bib
csl: transactions-on-speech-and-language-processing.csl
fig_caption: yes
header-includes: |
\usepackage{float}
\usepackage{booktabs,siunitx}
\floatplacement{figure}{H}

---

\pagebreak
@@ -137,9 +143,21 @@ Selon les observations de Liebeskind et al. [@liebeskind_comment_2018], les prin

## Description des corpus de textes

Nous analyserons les articles provenant des pages Facebook de trois médias écrits francophones : Le Figaro (France), Radio-Canada (Canada) et TVA Nouvelles (Canada). Pour chacun de ces médias, nous avons respectivement 25, 22 et 24 publications contenant un lien vers un article journalistique.
Nous analyserons les articles provenant des pages Facebook de trois médias écrits francophones : Le Figaro (FIG), Radio-Canada (RC) et TVA Nouvelles (TVA). Pour chacun de ces médias, nous avons un ensemble de publications Facebook contenant chacune un lien vers un article journalistique, ainsi qu'un corpus de commentaires extraits depuis celles-ci.

\begin{figure}
\centering
\caption{Décompte des articles par média}
\input{decompte_articles_medias}
\end{figure}

Le premier corpus étudié est constitué du texte de chacun des articles qui sont liés dans les publications (l'utilisateur de Facebook devant cliquer sur le lien pour y accéder). Le titre de l'article n'est pas inclus dans ce corpus. Le second corpus est constitué d'un ensemble de commentaires publiés par des utilisateurs du réseau social et associés à chacune des publications précédentes.

Le premier corpus étudié est constitué du texte de chacun des articles qui sont liés dans les publications (l'utilisateur de Facebook devant cliquer sur le lien pour y accéder). Le titre de l'article n'est pas inclus dans ce corpus. Le second corpus est constitué d'un ensemble de commentaires publiés par des utilisateurs du réseau social et associés à chacune des publications précédentes. Il y a respectivement 7155, 2947 et 6262 commentaires pour chacun des trois médias écrits.
\begin{figure}
\centering
\caption{Décompte des commentaires par média}
\input{decompte_comm_medias}
\end{figure}

Ces deux corpus ont été créés à l'aide des données de commentaires extraites depuis l'application en ligne exportcomments.com @noauthor_exportcomments.com_2019 dans des fichiers XLSX. Les fichiers ont par la suite été utilisés par les programmes Python suivants :

@@ -174,7 +192,8 @@ Expertise et prise de position

Intertextualité

- Réponse à un autre commentaire
- Mention de l'auteur d'un autre commentaire
- Présence de pronoms à la deuxième personne

## Méthodologie et algorithmes



+ 16
- 3
textes_articles.ipynb View File

@@ -2,9 +2,22 @@
"cells": [
{
"cell_type": "code",
"execution_count": null,
"execution_count": 1,
"metadata": {},
"outputs": [],
"outputs": [
{
"ename": "ModuleNotFoundError",
"evalue": "No module named 'newspaper'",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mModuleNotFoundError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-1-e35ec13ebf3b>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mparsing_functions\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpf\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 2\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mre\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 3\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mpandas\u001b[0m \u001b[0;32mas\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mtime\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;32m~/nlp_a2019_tp3/parsing_functions.py\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[1;32m 4\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0mrequests\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 5\u001b[0m \u001b[0;32mfrom\u001b[0m \u001b[0murllib\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mparse\u001b[0m \u001b[0;32mimport\u001b[0m \u001b[0munquote\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0;32m----> 6\u001b[0;31m \u001b[0;32mimport\u001b[0m \u001b[0mnewspaper\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m\u001b[1;32m 7\u001b[0m \u001b[0;34m\u001b[0m\u001b[0m\n\u001b[1;32m 8\u001b[0m \u001b[0;32mdef\u001b[0m \u001b[0mget_comments\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mfile_path\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m:\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n",
"\u001b[0;31mModuleNotFoundError\u001b[0m: No module named 'newspaper'"
]
}
],
"source": [
"import parsing_functions as pf\n",
"import re\n",
@@ -76,7 +89,7 @@
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
"version": "3.7.4"
}
},
"nbformat": 4,


Loading…
Cancel
Save