You can not select more than 25 topics Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.
 
 
 
 
 
 

597 lines
18 KiB

{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"# Load the pickled articles DataFrame. The with-block closes the file even\n",
"# when unpickling raises.\n",
"# NOTE: pickle.load can run arbitrary code; only open files you trust.\n",
"with open(\"pickle/textes_articles_df.pickle\", \"rb\") as f:\n",
"    textes_articles_df = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"0 True\n",
"1 True\n",
"2 True\n",
"3 True\n",
"4 True\n",
"Name: media, dtype: bool"
]
},
"execution_count": 19,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textes_articles_df.head()"
]
},
{
"cell_type": "code",
"execution_count": 13,
"metadata": {},
"outputs": [],
"source": [
"# Load the pickled comments DataFrame. The with-block closes the file even\n",
"# when unpickling raises.\n",
"# NOTE: pickle.load can run arbitrary code; only open files you trust.\n",
"with open(\"pickle/commentaires_df.pickle\", \"rb\") as f_comm:\n",
"    commentaires_df = pickle.load(f_comm)"
]
},
{
"cell_type": "code",
"execution_count": 23,
"metadata": {},
"outputs": [],
"source": [
"# Keep only the comments whose media is not CNN.\n",
"commentaires_df = commentaires_df.loc[lambda df: df[\"media\"] != 'CNN']"
]
},
{
"cell_type": "code",
"execution_count": 24,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_id</th>\n",
" <th>nested_id</th>\n",
" <th>name</th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>likes</th>\n",
" <th>comment</th>\n",
" <th>media</th>\n",
" <th>post_id</th>\n",
" <th>list_names</th>\n",
" <th>auteurs_referes</th>\n",
" <th>comment_clean</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" <th>emoji_dict</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>1.0</td>\n",
" <td>0</td>\n",
" <td>Ycf Bullit</td>\n",
" <td>ID: 100000615866313</td>\n",
" <td>2019-11-09 14:17:13</td>\n",
" <td>0</td>\n",
" <td>C'est une blague mdr 不不不不不</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Ycf Bullit]</td>\n",
" <td>[]</td>\n",
" <td>C'est une blague mdr 不不不不不</td>\n",
" <td>{}</td>\n",
" <td>{('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('...</td>\n",
" <td>{':rolling_on_the_floor_laughing:': [5, 6, 7]}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>2.0</td>\n",
" <td>0</td>\n",
" <td>Steph Alcazar</td>\n",
" <td>ID: 100001175077263</td>\n",
" <td>2019-11-09 14:17:34</td>\n",
" <td>0</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Steph Alcazar]</td>\n",
" <td>[]</td>\n",
" <td>La seule question c'est de savoir s'il fera pl...</td>\n",
" <td>{}</td>\n",
" <td>{('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>3.0</td>\n",
" <td>0</td>\n",
" <td>T繹m M羹st瓣ine</td>\n",
" <td>ID: 1365879404</td>\n",
" <td>2019-11-09 14:17:51</td>\n",
" <td>0</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[T繹m M羹st瓣ine]</td>\n",
" <td>[]</td>\n",
" <td>Romain Debrigode l info du jour qui fait plaise</td>\n",
" <td>{('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO...</td>\n",
" <td>{('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>4.0</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet</td>\n",
" <td>ID: 100000270292007</td>\n",
" <td>2019-11-09 14:18:06</td>\n",
" <td>0</td>\n",
" <td>Vasanth Toure </td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Vasanth Toure']</td>\n",
" <td></td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4.0</td>\n",
" <td>1</td>\n",
" <td>Vasanth Toure</td>\n",
" <td>ID: 100001494607801</td>\n",
" <td>2019-11-09 14:20:57</td>\n",
" <td>0</td>\n",
" <td>Pierre Crouzet Paris n'est pas pr礙t encore...</td>\n",
" <td>FIG</td>\n",
" <td>5dc7ac7f359e2-10157143278136339</td>\n",
" <td>[Pierre Crouzet, Vasanth Toure]</td>\n",
" <td>['Pierre Crouzet']</td>\n",
" <td>Paris n'est pas pr礙t encore...</td>\n",
" <td>{('Paris', 'LOCATION'): 1}</td>\n",
" <td>{('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('...</td>\n",
" <td>{}</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" comment_id nested_id name id \\\n",
"0 1.0 0 Ycf Bullit ID: 100000615866313 \n",
"1 2.0 0 Steph Alcazar ID: 100001175077263 \n",
"2 3.0 0 T繹m M羹st瓣ine ID: 1365879404 \n",
"3 4.0 0 Pierre Crouzet ID: 100000270292007 \n",
"4 4.0 1 Vasanth Toure ID: 100001494607801 \n",
"\n",
" date likes \\\n",
"0 2019-11-09 14:17:13 0 \n",
"1 2019-11-09 14:17:34 0 \n",
"2 2019-11-09 14:17:51 0 \n",
"3 2019-11-09 14:18:06 0 \n",
"4 2019-11-09 14:20:57 0 \n",
"\n",
" comment media \\\n",
"0 C'est une blague mdr 不不不不不 FIG \n",
"1 La seule question c'est de savoir s'il fera pl... FIG \n",
"2 Romain Debrigode l info du jour qui fait plaise FIG \n",
"3 Vasanth Toure FIG \n",
"4 Pierre Crouzet Paris n'est pas pr礙t encore... FIG \n",
"\n",
" post_id list_names \\\n",
"0 5dc7ac7f359e2-10157143278136339 [Ycf Bullit] \n",
"1 5dc7ac7f359e2-10157143278136339 [Steph Alcazar] \n",
"2 5dc7ac7f359e2-10157143278136339 [T繹m M羹st瓣ine] \n",
"3 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"4 5dc7ac7f359e2-10157143278136339 [Pierre Crouzet, Vasanth Toure] \n",
"\n",
" auteurs_referes comment_clean \\\n",
"0 [] C'est une blague mdr 不不不不不 \n",
"1 [] La seule question c'est de savoir s'il fera pl... \n",
"2 [] Romain Debrigode l info du jour qui fait plaise \n",
"3 ['Vasanth Toure'] \n",
"4 ['Pierre Crouzet'] Paris n'est pas pr礙t encore... \n",
"\n",
" ner_dict \\\n",
"0 {} \n",
"1 {} \n",
"2 {('Romain', 'PERSON'): 1, ('Debrigode', 'PERSO... \n",
"3 {} \n",
"4 {('Paris', 'LOCATION'): 1} \n",
"\n",
" pos_dict \\\n",
"0 {('est', 'VERB'): 1, ('blague', 'NOUN'): 1, ('... \n",
"1 {('seule', 'ADJ'): 1, ('question', 'NOUN'): 1,... \n",
"2 {('Romain', 'PROPN'): 1, ('Debrigode', 'PROPN'... \n",
"3 {} \n",
"4 {('Paris', 'PROPN'): 1, ('est', 'VERB'): 1, ('... \n",
"\n",
" emoji_dict \n",
"0 {':rolling_on_the_floor_laughing:': [5, 6, 7]} \n",
"1 {} \n",
"2 {} \n",
"3 {} \n",
"4 {} "
]
},
"execution_count": 24,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commentaires_df.head()"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"## Description des corpus"
]
},
{
"cell_type": "code",
"execution_count": 16,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>post_id</th>\n",
" <th>text</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" </tr>\n",
" <tr>\n",
" <th>media</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>FIG</th>\n",
" <td>25</td>\n",
" <td>25</td>\n",
" <td>25</td>\n",
" <td>25</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RC</th>\n",
" <td>22</td>\n",
" <td>22</td>\n",
" <td>22</td>\n",
" <td>22</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TVA</th>\n",
" <td>24</td>\n",
" <td>24</td>\n",
" <td>24</td>\n",
" <td>24</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" post_id text ner_dict pos_dict\n",
"media \n",
"FIG 25 25 25 25\n",
"RC 22 22 22 22\n",
"TVA 24 24 24 24"
]
},
"execution_count": 16,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"textes_articles_df.groupby(\"media\").count()"
]
},
{
"cell_type": "code",
"execution_count": 25,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>comment_id</th>\n",
" <th>nested_id</th>\n",
" <th>name</th>\n",
" <th>id</th>\n",
" <th>date</th>\n",
" <th>likes</th>\n",
" <th>comment</th>\n",
" <th>post_id</th>\n",
" <th>list_names</th>\n",
" <th>auteurs_referes</th>\n",
" <th>comment_clean</th>\n",
" <th>ner_dict</th>\n",
" <th>pos_dict</th>\n",
" <th>emoji_dict</th>\n",
" </tr>\n",
" <tr>\n",
" <th>media</th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" <th></th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>FIG</th>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7031</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" <td>7155</td>\n",
" </tr>\n",
" <tr>\n",
" <th>RC</th>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3905</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" <td>3947</td>\n",
" </tr>\n",
" <tr>\n",
" <th>TVA</th>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6160</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" <td>6262</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" comment_id nested_id name id date likes comment post_id \\\n",
"media \n",
"FIG 7155 7155 7155 7155 7155 7155 7031 7155 \n",
"RC 3947 3947 3947 3947 3947 3947 3905 3947 \n",
"TVA 6262 6262 6262 6262 6262 6262 6160 6262 \n",
"\n",
" list_names auteurs_referes comment_clean ner_dict pos_dict \\\n",
"media \n",
"FIG 7155 7155 7155 7155 7155 \n",
"RC 3947 3947 3947 3947 3947 \n",
"TVA 6262 6262 6262 6262 6262 \n",
"\n",
" emoji_dict \n",
"media \n",
"FIG 7155 \n",
"RC 3947 \n",
"TVA 6262 "
]
},
"execution_count": 25,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"commentaires_df.groupby(\"media\").count()"
]
},
{
"cell_type": "code",
"execution_count": 35,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"17364"
]
},
"execution_count": 35,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# Total number of comments, taken as the non-null entries of emoji_dict.\n",
"nb_comm = commentaires_df.loc[:, \"emoji_dict\"].count()\n",
"nb_comm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Number of comments that contain at least one emoji, i.e. whose emoji_dict\n",
"# is non-empty. Testing len(x) == 1 instead would leave out comments that\n",
"# use two or more distinct emoji.\n",
"nb_comm_emoji = (commentaires_df[\"emoji_dict\"].apply(len) > 0).sum()\n",
"nb_comm_emoji"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"# Share of comments that contain at least one emoji.\n",
"nb_comm_emoji / nb_comm"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}