Browse Source

commit initial

master
FRANCOIS PELLETIER 1 year ago
parent
commit
a95f89a69d
8 changed files with 3008 additions and 0 deletions
  1. +4
    -0
      .gitignore
  2. +261
    -0
      Commentaires sur les réseaux sociaux.mm
  3. +188
    -0
      Traitement Articles.ipynb
  4. +2036
    -0
      Traitement commentaires.ipynb
  5. +88
    -0
      commentaires.ipynb
  6. +261
    -0
      commentaires_reseaux_sociaux.mm
  7. +57
    -0
      parsing_functions.py
  8. +113
    -0
      textes_articles.ipynb

+ 4
- 0
.gitignore View File

@@ -1,3 +1,7 @@
data/
pickle/
refined_data/

# ---> Python
# Byte-compiled / optimized / DLL files
__pycache__/


+ 261
- 0
Commentaires sur les réseaux sociaux.mm View File

@@ -0,0 +1,261 @@
<map version="freeplane 1.7.0">
<!--To view this file, download free mind mapping software Freeplane from http://freeplane.sourceforge.net -->
<node TEXT="Commentaires sur les r&#xe9;seaux sociaux" FOLDED="false" ID="ID_1075161201" CREATED="1573954855044" MODIFIED="1573954867944" STYLE="oval">
<font SIZE="18"/>
<hook NAME="MapStyle">
<properties edgeColorConfiguration="#808080ff,#ff0000ff,#0000ffff,#00ff00ff,#ff00ffff,#00ffffff,#7c0000ff,#00007cff,#007c00ff,#7c007cff,#007c7cff,#7c7c00ff" fit_to_viewport="false"/>

<map_styles>
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval" UNIFORM_SHAPE="true" VGAP_QUANTITY="24.0 pt">
<font SIZE="24"/>
<stylenode LOCALIZED_TEXT="styles.predefined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="default" ICON_SIZE="12.0 pt" COLOR="#000000" STYLE="fork">
<font NAME="SansSerif" SIZE="10" BOLD="false" ITALIC="false"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.details"/>
<stylenode LOCALIZED_TEXT="defaultstyle.attributes">
<font SIZE="9"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.note" COLOR="#000000" BACKGROUND_COLOR="#ffffff" TEXT_ALIGN="LEFT"/>
<stylenode LOCALIZED_TEXT="defaultstyle.floating">
<edge STYLE="hide_edge"/>
<cloud COLOR="#f0f0f0" SHAPE="ROUND_RECT"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.user-defined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="styles.topic" COLOR="#18898b" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subtopic" COLOR="#cc3300" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subsubtopic" COLOR="#669900">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.important">
<icon BUILTIN="yes"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.AutomaticLayout" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="AutomaticLayout.level.root" COLOR="#000000" STYLE="oval" SHAPE_HORIZONTAL_MARGIN="10.0 pt" SHAPE_VERTICAL_MARGIN="10.0 pt">
<font SIZE="18"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,1" COLOR="#0033ff">
<font SIZE="16"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,2" COLOR="#00b439">
<font SIZE="14"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,3" COLOR="#990000">
<font SIZE="12"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,4" COLOR="#111111">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,5"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,6"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,7"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,8"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,9"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,10"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,11"/>
</stylenode>
</stylenode>
</map_styles>
</hook>
<hook NAME="AutomaticEdgeColor" COUNTER="6" RULE="ON_BRANCH_CREATION"/>
<node TEXT="Schultes" POSITION="left" ID="ID_1889975585" CREATED="1573955304008" MODIFIED="1573955306512">
<edge COLOR="#00ff00"/>
<node TEXT="Classification Youtube" ID="ID_960774191" CREATED="1573954869370" MODIFIED="1573955309655">
<node TEXT="&#xc9;tude" ID="ID_1621340516" CREATED="1573955028436" MODIFIED="1573955040248">
<node TEXT="Agressif: 42%" ID="ID_221633146" CREATED="1573954944708" MODIFIED="1573954950067"/>
<node TEXT="Essentiels: 6%" ID="ID_1440746951" CREATED="1573954967031" MODIFIED="1573954981297"/>
<node TEXT="Stupide: 51%" ID="ID_921312086" CREATED="1573954957415" MODIFIED="1573954966495"/>
<node TEXT="Non pertinents: 64%" ID="ID_1705312749" CREATED="1573954934920" MODIFIED="1573954944172"/>
</node>
<node TEXT="The Guardian 2009" ID="ID_352535949" CREATED="1573955053842" MODIFIED="1573955061455">
<node TEXT="Juvenile" ID="ID_1646713003" CREATED="1573955061461" MODIFIED="1573955078889"/>
<node TEXT="Aggressive" ID="ID_862048526" CREATED="1573955079074" MODIFIED="1573955081407"/>
<node TEXT="Misspelled" ID="ID_286709909" CREATED="1573955081602" MODIFIED="1573955087293"/>
<node TEXT="Sexist" ID="ID_153876225" CREATED="1573955087779" MODIFIED="1573955092833"/>
</node>
</node>
<node TEXT="Pourquoi commenter?" ID="ID_866509994" CREATED="1573955165314" MODIFIED="1573955312129">
<node TEXT="12% commentent" ID="ID_1041837989" CREATED="1573955193866" MODIFIED="1573955202502"/>
<node TEXT="34% lisent les commentaires" ID="ID_1372530703" CREATED="1573955232869" MODIFIED="1573955240608"/>
<node TEXT="53% regardent les 2-3 premiers commentaires" ID="ID_1662261189" CREATED="1573955241001" MODIFIED="1573955255585"/>
</node>
<node TEXT="Classification en deux &#xe9;tapes" ID="ID_1588097956" CREATED="1573955358766" MODIFIED="1573955365555">
<node TEXT="Type et qualit&#xe9;" ID="ID_1894181819" CREATED="1573955376045" MODIFIED="1573955381723">
<node TEXT="La distribution est pertinente pour d&#xe9;crire un vid&#xe9;o" ID="ID_993533082" CREATED="1573955400296" MODIFIED="1573955411700"/>
<node TEXT="Lien direct avec les &quot;Like&quot;, donc la rentabilit&#xe9; du vid&#xe9;o pour Youtube" ID="ID_195371144" CREATED="1573955596788" MODIFIED="1573955616313">
<node TEXT="Mod&#xe9;lisation R&#xe9;gression Neg. Binomiale" ID="ID_218766081" CREATED="1573956262222" MODIFIED="1573956280088"/>
</node>
<node TEXT="Types" ID="ID_868118585" CREATED="1573955883446" MODIFIED="1573955884542">
<node TEXT="t1: Discussion" ID="ID_1000797050" CREATED="1573955777851" MODIFIED="1573955783201"/>
<node TEXT="t3: substantiels" ID="ID_1205292896" CREATED="1573955790350" MODIFIED="1573955794297"/>
<node TEXT="t2: inf&#xe9;rieurs" ID="ID_392602295" CREATED="1573955783999" MODIFIED="1573955789685"/>
</node>
<node TEXT="Outils" ID="ID_144483136" CREATED="1573955937321" MODIFIED="1573955941603">
<node TEXT="Offensive: SentiStrength" ID="ID_1491683881" CREATED="1573955828111" MODIFIED="1573955833421"/>
<node TEXT="Liste manuelle de marqueurs &#xe9;motionnels" ID="ID_217513801" CREATED="1573955864375" MODIFIED="1573955877200"/>
<node TEXT="Appariement de mots-cl&#xe9;s et du titre" ID="ID_1268991488" CREATED="1573955952300" MODIFIED="1573955975377"/>
</node>
<node TEXT="" ID="ID_667449169" CREATED="1573955947623" MODIFIED="1573955947623"/>
</node>
<node TEXT="Permettent l&apos;analyse s&#xe9;mantique du vid&#xe9;o" ID="ID_1291677418" CREATED="1573955433090" MODIFIED="1573955442246">
<node TEXT="Mod&#xe9;lisation avec un SVM" ID="ID_1415408576" CREATED="1573956080194" MODIFIED="1573956086398">
<node TEXT="Variable r&#xe9;ponse: cat&#xe9;gorie du vid&#xe9;o (News, Sports, Music, ...)" ID="ID_92620342" CREATED="1573956116685" MODIFIED="1573956166425"/>
<node TEXT="Caract&#xe9;ristiques: Type et qualit&#xe9; des commentaires" ID="ID_575459159" CREATED="1573956125581" MODIFIED="1573956141215"/>
</node>
<node TEXT="" ID="ID_1244708423" CREATED="1573956216954" MODIFIED="1573956259964"/>
</node>
</node>
<node TEXT="Inspiration de Ammari et al." ID="ID_473432831" CREATED="1573955655134" MODIFIED="1573955845106">
<node TEXT="Identifier les commentaires &quot;noisy&quot;" ID="ID_156080744" CREATED="1573955669957" MODIFIED="1573955686377"/>
</node>
<node TEXT="https://www.semanticscholar.org/paper/Leave-a-Comment!-An-In-Depth-Analysis-of-User-on-Schultes-Dorner/d84ec961f13ebc56bd45f63ac78a6e07bbba2a63" ID="ID_779661135" CREATED="1573956897563" MODIFIED="1573956898796"/>
</node>
<node TEXT="Impl&#xe9;mentation" POSITION="right" ID="ID_603062380" CREATED="1573956867473" MODIFIED="1573956871207">
<edge COLOR="#ff00ff"/>
<node TEXT="https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost/index.html" ID="ID_646162134" CREATED="1573956872038" MODIFIED="1573956872849"/>
</node>
<node TEXT="Halt&#xe9; - Emoticones" POSITION="right" ID="ID_1763549449" CREATED="1575238775212" MODIFIED="1575238785034">
<edge COLOR="#00ffff"/>
<node TEXT="Emoticones" ID="ID_386506934" CREATED="1575238883772" MODIFIED="1575238887445">
<node TEXT="Mim&#xe9;tique et gestuelle" ID="ID_1631325076" CREATED="1575239148209" MODIFIED="1575239154376"/>
<node TEXT="Plus fort que les interjections" ID="ID_886136844" CREATED="1575240727401" MODIFIED="1575240735333"/>
<node TEXT="Syst&#xe8;me d&apos;&#xe9;criture" ID="ID_525765510" CREATED="1575239409720" MODIFIED="1575239420069">
<node TEXT="R&#xe8;gles" ID="ID_1483343948" CREATED="1575239421113" MODIFIED="1575239424034"/>
<node TEXT="Oppositions" ID="ID_1849414542" CREATED="1575239424924" MODIFIED="1575239427759"/>
<node TEXT="Usages" ID="ID_1764780335" CREATED="1575239428189" MODIFIED="1575239429883"/>
</node>
<node TEXT="Emojis" FOLDED="true" ID="ID_838340950" CREATED="1575242128366" MODIFIED="1575242133912">
<node TEXT="Banque normalis&#xe9;e" ID="ID_1322377772" CREATED="1575242133919" MODIFIED="1575242238282">
<node TEXT="Verbal" ID="ID_1610044649" CREATED="1575242238977" MODIFIED="1575242243717">
<node TEXT="Modalit&#xe9;" ID="ID_1389558503" CREATED="1575242266230" MODIFIED="1575242268798"/>
</node>
<node TEXT="Non-verbal" FOLDED="true" ID="ID_430063472" CREATED="1575242244105" MODIFIED="1575242250451">
<node TEXT="Objets" ID="ID_900313374" CREATED="1575242250461" MODIFIED="1575242253644"/>
<node TEXT="Actions" ID="ID_1522816072" CREATED="1575242254098" MODIFIED="1575242264462"/>
</node>
<node TEXT="Diff&#xe9;rentes parties du langage (Peirce)" ID="ID_1811986887" CREATED="1575242392026" MODIFIED="1575242450835">
<node TEXT="Iconique" ID="ID_1371581343" CREATED="1575242421848" MODIFIED="1575242424675"/>
<node TEXT="Indiciel" ID="ID_213205468" CREATED="1575242425224" MODIFIED="1575242427614"/>
<node TEXT="Symbolique" ID="ID_349389975" CREATED="1575242428002" MODIFIED="1575242430884"/>
</node>
</node>
</node>
<node TEXT="Provine et al. Le sens de l&apos;&#xe9;moticone est une information additionnelle ou compl&#xe9;mentaire au message" ID="ID_790078370" CREATED="1575245786150" MODIFIED="1575245837675">
<node TEXT="Nues" ID="ID_1650267953" CREATED="1575245860851" MODIFIED="1575245863060"/>
<node TEXT="D&#xe9;but ou fin (+ fr&#xe9;quent)" ID="ID_870280863" CREATED="1575245863647" MODIFIED="1575245884155"/>
<node TEXT="Int&#xe9;rieures" ID="ID_37382522" CREATED="1575245868733" MODIFIED="1575245871544"/>
</node>
</node>
<node TEXT="Interjections et sigles" FOLDED="true" ID="ID_818337825" CREATED="1575238887904" MODIFIED="1575238893112">
<node TEXT="Attitude subjective" ID="ID_1458901294" CREATED="1575238961067" MODIFIED="1575238969600"/>
<node TEXT="Remplace les gestes, mimiques, intonations" ID="ID_1708771216" CREATED="1575238786167" MODIFIED="1575238882360"/>
<node TEXT="montrer plut&#xf4;t que dire" ID="ID_1458650089" CREATED="1575239025028" MODIFIED="1575239031427"/>
</node>
<node TEXT="Le tchat" ID="ID_972070502" CREATED="1575240740283" MODIFIED="1575240747317">
<node TEXT="Parfois synchrone, parfois non." ID="ID_1205295086" CREATED="1575240748630" MODIFIED="1575240779677"/>
<node TEXT="Indices contextuels forts" ID="ID_77869570" CREATED="1575240780110" MODIFIED="1575240786499"/>
<node TEXT="Tours de paroles segment&#xe9;s" ID="ID_971520480" CREATED="1575240805520" MODIFIED="1575240878843"/>
<node TEXT="Conversations entrelac&#xe9;es" ID="ID_464582331" CREATED="1575240929788" MODIFIED="1575240941056">
<node TEXT="Quiproquo" ID="ID_1623621173" CREATED="1575240943203" MODIFIED="1575240948763"/>
<node TEXT="Situations humoristiques" ID="ID_816264224" CREATED="1575240949163" MODIFIED="1575240953649"/>
</node>
<node TEXT="&#xc9;moticone: Port&#xe9;e variable" ID="ID_94048404" CREATED="1575260093511" MODIFIED="1575260103300"/>
<node TEXT="&#xc9;nonc&#xe9;s sur plusieurs lignes, s&#xe9;parations syntaxiques ou non" ID="ID_1730871314" CREATED="1575260541509" MODIFIED="1575260562710"/>
<node TEXT="Prise en compte/prise en charge" ID="ID_266160815" CREATED="1575260407487" MODIFIED="1575260417401">
<node TEXT="L&apos;&#xe9;moticone permet de ne pas prendre position, mais de montrer qu&apos;on a bien re&#xe7;u ce qui a &#xe9;t&#xe9; dit" ID="ID_362215456" CREATED="1575260417403" MODIFIED="1575260451519"/>
</node>
</node>
</node>
<node TEXT="Georgalou - Discourse and identity on Facebook" POSITION="left" ID="ID_573961500" CREATED="1576432501896" MODIFIED="1576439987458">
<edge COLOR="#7c0000"/>
<node TEXT="Nouveaux &#xe9;l&#xe9;ments du langage" ID="ID_884645106" CREATED="1576432511262" MODIFIED="1576432547779">
<node TEXT="Ponctuations multiples ?!" ID="ID_1995619041" CREATED="1576432549422" MODIFIED="1576437103533"/>
<node TEXT="Interjections" ID="ID_607633784" CREATED="1576437113783" MODIFIED="1576437117640"/>
<node TEXT="Majuscules" ID="ID_1383386533" CREATED="1576437118071" MODIFIED="1576437122120"/>
</node>
<node TEXT="Analyse du discours (Baxter 2010)" ID="ID_1424801803" CREATED="1576432904446" MODIFIED="1576433023929">
<node TEXT="Variabilit&#xe9;" ID="ID_1655319920" CREATED="1576432921042" MODIFIED="1576432935538">
<node TEXT="Audience" ID="ID_1287238103" CREATED="1576432935545" MODIFIED="1576432942327"/>
<node TEXT="Contexte" ID="ID_1017431291" CREATED="1576432943825" MODIFIED="1576432954803"/>
</node>
<node TEXT="Nature du langage" ID="ID_397305792" CREATED="1576432966755" MODIFIED="1576432977141">
<node TEXT="Description" ID="ID_950767559" CREATED="1576432978593" MODIFIED="1576432983163"/>
<node TEXT="Narration" ID="ID_868814925" CREATED="1576432983797" MODIFIED="1576432987481"/>
<node TEXT="Remarques" ID="ID_877450704" CREATED="1576432988675" MODIFIED="1576432996039"/>
<node TEXT="Commentaires" ID="ID_1281731741" CREATED="1576432996663" MODIFIED="1576433001128"/>
<node TEXT="Blagues" ID="ID_1309797672" CREATED="1576433001508" MODIFIED="1576433003773"/>
</node>
<node TEXT="R&#xe9;pertoire" ID="ID_1024196052" CREATED="1576433048058" MODIFIED="1576433050691">
<node TEXT="Vocabulaire" ID="ID_709732839" CREATED="1576433053478" MODIFIED="1576433057503"/>
<node TEXT="Grammaire" ID="ID_736734389" CREATED="1576433058284" MODIFIED="1576433066147"/>
<node TEXT="Figures de style" ID="ID_137013127" CREATED="1576433075268" MODIFIED="1576433080277"/>
</node>
<node TEXT="Approche" ID="ID_1561649830" CREATED="1576433106369" MODIFIED="1576433112084">
<node TEXT="Contexte psychologique" ID="ID_1939058865" CREATED="1576433112090" MODIFIED="1576433123678"/>
<node TEXT="Contexte sociopolitique" ID="ID_791737082" CREATED="1576433124443" MODIFIED="1576433130862"/>
</node>
</node>
<node TEXT="&#xc9;l&#xe9;ments d&apos;analyse" ID="ID_994837948" CREATED="1576433185944" MODIFIED="1576433207370">
<node TEXT="Intertextualit&#xe9;" ID="ID_1353601895" CREATED="1576433212711" MODIFIED="1576433223175">
<node TEXT="Liens avec les textes pr&#xe9;c&#xe9;dents" ID="ID_261026690" CREATED="1576433278055" MODIFIED="1576433286703"/>
</node>
<node TEXT="Interdiscursivit&#xe9;" ID="ID_1355846398" CREATED="1576433224084" MODIFIED="1576433232077">
<node TEXT="Interaction et superposition des &#xe9;changes" ID="ID_1296550423" CREATED="1576433265025" MODIFIED="1576433274898"/>
</node>
<node TEXT="Multimodalit&#xe9;" ID="ID_398609083" CREATED="1576433232685" MODIFIED="1576433253744">
<node TEXT="Images" ID="ID_996455266" CREATED="1576433241535" MODIFIED="1576433243920"/>
<node TEXT="Textes" ID="ID_1799781603" CREATED="1576433244372" MODIFIED="1576433245825"/>
<node TEXT="Vid&#xe9;os" ID="ID_421888374" CREATED="1576433246259" MODIFIED="1576433247808"/>
</node>
</node>
<node TEXT="Localisation" FOLDED="true" ID="ID_206247466" CREATED="1576433752181" MODIFIED="1576433841871">
<node TEXT="Textualisation" FOLDED="true" ID="ID_1774810454" CREATED="1576433842212" MODIFIED="1576433850744">
<node TEXT="&#xc9;l&#xe9;ment culturel" ID="ID_726017284" CREATED="1576433850747" MODIFIED="1576433859642"/>
</node>
<node TEXT="M&#xe9;tonymie" FOLDED="true" ID="ID_1450191917" CREATED="1576433862048" MODIFIED="1576433910984">
<node TEXT="Inclut la localisation, mais dans un autre type lexical" ID="ID_1282192414" CREATED="1576433915495" MODIFIED="1576433947584"/>
</node>
<node TEXT="Personnification" ID="ID_1888056104" CREATED="1576433968022" MODIFIED="1576433972282"/>
<node TEXT="S&#xe9;miotique transgressive" ID="ID_508849276" CREATED="1576434327152" MODIFIED="1576434342653">
<node TEXT="Signes qui ne vont pas ensemble, dans un m&#xea;me discours. Pour marquer l&apos;opposition implicite" ID="ID_1864195644" CREATED="1576434347915" MODIFIED="1576434379898"/>
</node>
</node>
<node TEXT="Temps" FOLDED="true" ID="ID_1259089697" CREATED="1576436972135" MODIFIED="1576436974049">
<node TEXT="Temporalit&#xe9;" ID="ID_935107929" CREATED="1576436975051" MODIFIED="1576436979169"/>
<node TEXT="Notion de maintenant" ID="ID_63573093" CREATED="1576436979945" MODIFIED="1576436985093"/>
<node TEXT="Cycles" ID="ID_1458339480" CREATED="1576437010439" MODIFIED="1576437014557"/>
<node TEXT="Saisons, f&#xea;tes" ID="ID_566591651" CREATED="1576437044816" MODIFIED="1576437049959"/>
<node TEXT="Pass&#xe9; et futur" ID="ID_1383560062" CREATED="1576437051190" MODIFIED="1576437056318"/>
<node TEXT="&#xc2;ge, anniversaires" ID="ID_1886330627" CREATED="1576437056834" MODIFIED="1576437061864"/>
</node>
<node TEXT="&#xc9;ducation et expertise" FOLDED="true" ID="ID_869848700" CREATED="1576437570827" MODIFIED="1576437579348">
<node TEXT="Montrer son expertise" ID="ID_1657747338" CREATED="1576437580284" MODIFIED="1576437611763"/>
<node TEXT="Se r&#xe9;clamer le droit de mener la discussion (entitlement)" ID="ID_1490384821" CREATED="1576437612334" MODIFIED="1576437625926"/>
<node TEXT="Montrer ses r&#xe9;ussites acad&#xe9;miques" ID="ID_1508946328" CREATED="1576437628508" MODIFIED="1576437637956"/>
<node TEXT="R&#xe9;f&#xe9;rences, imp&#xe9;ratifs, pr&#xe9;supposition, souhaits" ID="ID_718153801" CREATED="1576437668961" MODIFIED="1576437681186"/>
</node>
<node TEXT="Position" ID="ID_727200733" CREATED="1576438549864" MODIFIED="1576438559654">
<node TEXT="Expression" FOLDED="true" ID="ID_986508069" CREATED="1576438633543" MODIFIED="1576438637443">
<node TEXT="Attitude" ID="ID_274391283" CREATED="1576438566378" MODIFIED="1576438593868"/>
<node TEXT="&#xc9;motion" ID="ID_920344361" CREATED="1576438594326" MODIFIED="1576438596613"/>
<node TEXT="Croyance" ID="ID_592625157" CREATED="1576438596770" MODIFIED="1576438599820"/>
<node TEXT="&#xc9;valuation/jugement" ID="ID_717619617" CREATED="1576438600172" MODIFIED="1576438605956"/>
<node TEXT="Engagement" ID="ID_1367600286" CREATED="1576438606159" MODIFIED="1576438609780"/>
</node>
<node TEXT="Attributs linguistiques" ID="ID_825473217" CREATED="1576438657794" MODIFIED="1576438663960">
<node TEXT="Modalit&#xe9;" ID="ID_197540513" CREATED="1576438664912" MODIFIED="1576438672368"/>
<node TEXT="&#xc9;valuation" ID="ID_1335244587" CREATED="1576438672980" MODIFIED="1576438676694"/>
<node TEXT="Politesse" ID="ID_247379549" CREATED="1576438698682" MODIFIED="1576438703989"/>
<node TEXT="&#xc9;videntialit&#xe9;" ID="ID_1818362871" CREATED="1576438704862" MODIFIED="1576438708497"/>
<node TEXT="Intensit&#xe9;" ID="ID_517078263" CREATED="1576438742226" MODIFIED="1576438746807"/>
</node>
</node>
</node>
</node>
</map>

+ 188
- 0
Traitement Articles.ipynb View File

@@ -0,0 +1,188 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 153,
"metadata": {},
"outputs": [],
"source": [
"import pandas as pd\n",
"import numpy as np\n",
"from nltk.corpus import stopwords\n",
"from nltk.tokenize import toktok, sent_tokenize\n",
"from nltk.parse import CoreNLPParser\n",
"import re\n",
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"tok = toktok.ToktokTokenizer()"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"pos_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='pos')"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"ner_tagger = CoreNLPParser(url='http://localhost:9000', tagtype='ner')\n",
"#https://github.com/nltk/nltk/wiki/Stanford-CoreNLP-API-in-NLTK"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df = pd.read_csv(\"refined_data/textes_articles_df.csv\")\n",
"textes_articles_df = textes_articles_df[textes_articles_df[\"text\"].notnull() & (textes_articles_df[\"media\"]!='CNN')]"
]
},
{
"cell_type": "code",
"execution_count": 163,
"metadata": {},
"outputs": [],
"source": [
"del textes_articles_df['Unnamed: 0']"
]
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Prétraitement\n",
"def pretraitement(article):\n",
" # tokeniser par phrases\n",
" article_sentences = sent_tokenize(article)\n",
" article_ner_tokens = []\n",
" article_pos_tokens = []\n",
" compteur_phrase = 0\n",
" for sentence in article_sentences:\n",
" # Tokeniser\n",
" sentence_tokens = tok.tokenize(sentence)\n",
" # Assembler les entités nommées et colocations\n",
" sentence_ner = ner_tagger.tag(sentence_tokens)\n",
" ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']\n",
" # Supprimer les classes fermées avec un POS\n",
" sentence_pos = pos_tagger.tag(sentence_tokens)\n",
" pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in ['ADJ','ADV','INTJ','NOUN','PROPN','VERB']]\n",
" # Ajouter à la liste de phrases tokenisées\n",
" article_ner_tokens.append(ner_tokens)\n",
" article_pos_tokens.append(pos_tokens)\n",
" compteur_phrase += 1\n",
" return article_ner_tokens, article_pos_tokens"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"article_pretraite = [pretraitement(x) for x in list(textes_articles_df[\"text\"])]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"def aggreger_ner_tags(article):\n",
" dict_named_entity = {}\n",
" for sentence in article[0]:\n",
" for entity in sentence:\n",
" dict_named_entity[entity] = dict_named_entity.get(entity,0) + 1\n",
" return dict_named_entity"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"def aggreger_pos_tags(article):\n",
" dict_pos = {}\n",
" for sentence in article[1]:\n",
" for pos in sentence:\n",
" dict_pos[pos] = dict_pos.get(pos,0) + 1\n",
" return dict_pos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df['ner_dict']=[aggreger_ner_tags(article) for article in article_pretraite]\n",
"textes_articles_df['pos_dict']=[aggreger_pos_tags(article) for article in article_pretraite]"
]
},
{
"cell_type": "code",
"execution_count": 167,
"metadata": {},
"outputs": [],
"source": [
"f = open(\"pickle/textes_articles_df.pickle\",\"wb\")\n",
"pickle.dump(textes_articles_df,f)\n",
"f.close()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 2036
- 0
Traitement commentaires.ipynb
File diff suppressed because it is too large
View File


+ 88
- 0
commentaires.ipynb View File

@@ -0,0 +1,88 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import parsing_functions as pf\n",
"import re\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"listOfFiles = pf.getListOfFiles(\"data\")"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [],
"source": [
"commentaires = []\n",
"\n",
"for xlpath in listOfFiles:\n",
" comments_df = []\n",
" media, post_id = re.match(r\"data/([A-Z]+)/comments([0-9a-z\\-]+)\\.xlsx\",xlpath).groups()\n",
" comments_df = pf.get_comments(xlpath)\n",
" comments_df['media']=media\n",
" comments_df['post_id']=post_id\n",
" commentaires.append(comments_df)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"commentaires_df = pd.concat(commentaires, ignore_index=True)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"commentaires_df.to_csv(\"refined_data/commentaires_df.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 261
- 0
commentaires_reseaux_sociaux.mm View File

@@ -0,0 +1,261 @@
<map version="freeplane 1.7.0">
<!--To view this file, download free mind mapping software Freeplane from http://freeplane.sourceforge.net -->
<node TEXT="Commentaires sur les r&#xe9;seaux sociaux" FOLDED="false" ID="ID_1075161201" CREATED="1573954855044" MODIFIED="1573954867944" STYLE="oval">
<font SIZE="18"/>
<hook NAME="MapStyle">
<properties edgeColorConfiguration="#808080ff,#ff0000ff,#0000ffff,#00ff00ff,#ff00ffff,#00ffffff,#7c0000ff,#00007cff,#007c00ff,#7c007cff,#007c7cff,#7c7c00ff" fit_to_viewport="false"/>

<map_styles>
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval" UNIFORM_SHAPE="true" VGAP_QUANTITY="24.0 pt">
<font SIZE="24"/>
<stylenode LOCALIZED_TEXT="styles.predefined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="default" ICON_SIZE="12.0 pt" COLOR="#000000" STYLE="fork">
<font NAME="SansSerif" SIZE="10" BOLD="false" ITALIC="false"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.details"/>
<stylenode LOCALIZED_TEXT="defaultstyle.attributes">
<font SIZE="9"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.note" COLOR="#000000" BACKGROUND_COLOR="#ffffff" TEXT_ALIGN="LEFT"/>
<stylenode LOCALIZED_TEXT="defaultstyle.floating">
<edge STYLE="hide_edge"/>
<cloud COLOR="#f0f0f0" SHAPE="ROUND_RECT"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.user-defined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="styles.topic" COLOR="#18898b" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subtopic" COLOR="#cc3300" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subsubtopic" COLOR="#669900">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.important">
<icon BUILTIN="yes"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.AutomaticLayout" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="AutomaticLayout.level.root" COLOR="#000000" STYLE="oval" SHAPE_HORIZONTAL_MARGIN="10.0 pt" SHAPE_VERTICAL_MARGIN="10.0 pt">
<font SIZE="18"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,1" COLOR="#0033ff">
<font SIZE="16"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,2" COLOR="#00b439">
<font SIZE="14"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,3" COLOR="#990000">
<font SIZE="12"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,4" COLOR="#111111">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,5"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,6"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,7"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,8"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,9"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,10"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,11"/>
</stylenode>
</stylenode>
</map_styles>
</hook>
<hook NAME="AutomaticEdgeColor" COUNTER="6" RULE="ON_BRANCH_CREATION"/>
<node TEXT="Schultes" POSITION="left" ID="ID_1889975585" CREATED="1573955304008" MODIFIED="1573955306512">
<edge COLOR="#00ff00"/>
<node TEXT="Classification Youtube" ID="ID_960774191" CREATED="1573954869370" MODIFIED="1573955309655">
<node TEXT="&#xc9;tude" ID="ID_1621340516" CREATED="1573955028436" MODIFIED="1573955040248">
<node TEXT="Agressif: 42%" ID="ID_221633146" CREATED="1573954944708" MODIFIED="1573954950067"/>
<node TEXT="Essentiels: 6%" ID="ID_1440746951" CREATED="1573954967031" MODIFIED="1573954981297"/>
<node TEXT="Stupide: 51%" ID="ID_921312086" CREATED="1573954957415" MODIFIED="1573954966495"/>
<node TEXT="Non pertinents: 64%" ID="ID_1705312749" CREATED="1573954934920" MODIFIED="1573954944172"/>
</node>
<node TEXT="The Guardian 2009" ID="ID_352535949" CREATED="1573955053842" MODIFIED="1573955061455">
<node TEXT="Juvenile" ID="ID_1646713003" CREATED="1573955061461" MODIFIED="1573955078889"/>
<node TEXT="Aggressive" ID="ID_862048526" CREATED="1573955079074" MODIFIED="1573955081407"/>
<node TEXT="Misspelled" ID="ID_286709909" CREATED="1573955081602" MODIFIED="1573955087293"/>
<node TEXT="Sexist" ID="ID_153876225" CREATED="1573955087779" MODIFIED="1573955092833"/>
</node>
</node>
<node TEXT="Pourquoi commenter?" ID="ID_866509994" CREATED="1573955165314" MODIFIED="1573955312129">
<node TEXT="12% commentent" ID="ID_1041837989" CREATED="1573955193866" MODIFIED="1573955202502"/>
<node TEXT="34% lisent les commentaires" ID="ID_1372530703" CREATED="1573955232869" MODIFIED="1573955240608"/>
<node TEXT="53% regardent les 2-3 premiers commentaires" ID="ID_1662261189" CREATED="1573955241001" MODIFIED="1573955255585"/>
</node>
<node TEXT="Classification en deux &#xe9;tapes" ID="ID_1588097956" CREATED="1573955358766" MODIFIED="1573955365555">
<node TEXT="Type et qualit&#xe9;" ID="ID_1894181819" CREATED="1573955376045" MODIFIED="1573955381723">
<node TEXT="La distribution est pertinente pour d&#xe9;crire un vid&#xe9;o" ID="ID_993533082" CREATED="1573955400296" MODIFIED="1573955411700"/>
<node TEXT="Lien direct avec les &quot;Like&quot;, donc la rentabilit&#xe9; du vid&#xe9;o pour Youtube" ID="ID_195371144" CREATED="1573955596788" MODIFIED="1573955616313">
<node TEXT="Mod&#xe9;lisation R&#xe9;gression Neg. Binomiale" ID="ID_218766081" CREATED="1573956262222" MODIFIED="1573956280088"/>
</node>
<node TEXT="Types" ID="ID_868118585" CREATED="1573955883446" MODIFIED="1573955884542">
<node TEXT="t1: Discussion" ID="ID_1000797050" CREATED="1573955777851" MODIFIED="1573955783201"/>
<node TEXT="t3: substantiels" ID="ID_1205292896" CREATED="1573955790350" MODIFIED="1573955794297"/>
<node TEXT="t2: inf&#xe9;rieurs" ID="ID_392602295" CREATED="1573955783999" MODIFIED="1573955789685"/>
</node>
<node TEXT="Outils" ID="ID_144483136" CREATED="1573955937321" MODIFIED="1573955941603">
<node TEXT="Offensive: SentiStrength" ID="ID_1491683881" CREATED="1573955828111" MODIFIED="1573955833421"/>
<node TEXT="Liste manuelle de marqueurs &#xe9;motionnels" ID="ID_217513801" CREATED="1573955864375" MODIFIED="1573955877200"/>
<node TEXT="Appariement de mots-cl&#xe9;s et du titre" ID="ID_1268991488" CREATED="1573955952300" MODIFIED="1573955975377"/>
</node>
<node TEXT="" ID="ID_667449169" CREATED="1573955947623" MODIFIED="1573955947623"/>
</node>
<node TEXT="Permettent l&apos;analyse s&#xe9;mantique du vid&#xe9;o" ID="ID_1291677418" CREATED="1573955433090" MODIFIED="1573955442246">
<node TEXT="Mod&#xe9;lisation avec un SVM" ID="ID_1415408576" CREATED="1573956080194" MODIFIED="1573956086398">
<node TEXT="Variable r&#xe9;ponse: cat&#xe9;gorie du vid&#xe9;o (News, Sports, Music, ...)" ID="ID_92620342" CREATED="1573956116685" MODIFIED="1573956166425"/>
<node TEXT="Caract&#xe9;ristiques: Type et qualit&#xe9; des commentaires" ID="ID_575459159" CREATED="1573956125581" MODIFIED="1573956141215"/>
</node>
<node TEXT="" ID="ID_1244708423" CREATED="1573956216954" MODIFIED="1573956259964"/>
</node>
</node>
<node TEXT="Inspiration de Ammari et al." ID="ID_473432831" CREATED="1573955655134" MODIFIED="1573955845106">
<node TEXT="Identifier les commentaires &quot;noisy&quot;" ID="ID_156080744" CREATED="1573955669957" MODIFIED="1573955686377"/>
</node>
<node TEXT="https://www.semanticscholar.org/paper/Leave-a-Comment!-An-In-Depth-Analysis-of-User-on-Schultes-Dorner/d84ec961f13ebc56bd45f63ac78a6e07bbba2a63" ID="ID_779661135" CREATED="1573956897563" MODIFIED="1573956898796"/>
</node>
<node TEXT="Impl&#xe9;mentation" POSITION="right" ID="ID_603062380" CREATED="1573956867473" MODIFIED="1573956871207">
<edge COLOR="#ff00ff"/>
<node TEXT="https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost/index.html" ID="ID_646162134" CREATED="1573956872038" MODIFIED="1573956872849"/>
</node>
<node TEXT="Halt&#xe9; - Emoticones" POSITION="right" ID="ID_1763549449" CREATED="1575238775212" MODIFIED="1575238785034">
<edge COLOR="#00ffff"/>
<node TEXT="Emoticones" ID="ID_386506934" CREATED="1575238883772" MODIFIED="1575238887445">
<node TEXT="Mim&#xe9;tique et gestuelle" ID="ID_1631325076" CREATED="1575239148209" MODIFIED="1575239154376"/>
<node TEXT="Plus fort que les interjections" ID="ID_886136844" CREATED="1575240727401" MODIFIED="1575240735333"/>
<node TEXT="Syst&#xe8;me d&apos;&#xe9;criture" ID="ID_525765510" CREATED="1575239409720" MODIFIED="1575239420069">
<node TEXT="R&#xe8;gles" ID="ID_1483343948" CREATED="1575239421113" MODIFIED="1575239424034"/>
<node TEXT="Oppositions" ID="ID_1849414542" CREATED="1575239424924" MODIFIED="1575239427759"/>
<node TEXT="Usages" ID="ID_1764780335" CREATED="1575239428189" MODIFIED="1575239429883"/>
</node>
<node TEXT="Emojis" FOLDED="true" ID="ID_838340950" CREATED="1575242128366" MODIFIED="1575242133912">
<node TEXT="Banque normalis&#xe9;e" ID="ID_1322377772" CREATED="1575242133919" MODIFIED="1575242238282">
<node TEXT="Verbal" ID="ID_1610044649" CREATED="1575242238977" MODIFIED="1575242243717">
<node TEXT="Modalit&#xe9;" ID="ID_1389558503" CREATED="1575242266230" MODIFIED="1575242268798"/>
</node>
<node TEXT="Non-verbal" FOLDED="true" ID="ID_430063472" CREATED="1575242244105" MODIFIED="1575242250451">
<node TEXT="Objets" ID="ID_900313374" CREATED="1575242250461" MODIFIED="1575242253644"/>
<node TEXT="Actions" ID="ID_1522816072" CREATED="1575242254098" MODIFIED="1575242264462"/>
</node>
<node TEXT="Diff&#xe9;rentes parties du langage (Peirce)" ID="ID_1811986887" CREATED="1575242392026" MODIFIED="1575242450835">
<node TEXT="Iconique" ID="ID_1371581343" CREATED="1575242421848" MODIFIED="1575242424675"/>
<node TEXT="Indiciel" ID="ID_213205468" CREATED="1575242425224" MODIFIED="1575242427614"/>
<node TEXT="Symbolique" ID="ID_349389975" CREATED="1575242428002" MODIFIED="1575242430884"/>
</node>
</node>
</node>
<node TEXT="Provine et al. Le sens de l&apos;&#xe9;moticone est une information additionnelle ou compl&#xe9;mentaire au message" ID="ID_790078370" CREATED="1575245786150" MODIFIED="1575245837675">
<node TEXT="Nues" ID="ID_1650267953" CREATED="1575245860851" MODIFIED="1575245863060"/>
<node TEXT="D&#xe9;but ou fin (+ fr&#xe9;quent)" ID="ID_870280863" CREATED="1575245863647" MODIFIED="1575245884155"/>
<node TEXT="Int&#xe9;rieures" ID="ID_37382522" CREATED="1575245868733" MODIFIED="1575245871544"/>
</node>
</node>
<node TEXT="Interjections et sigles" FOLDED="true" ID="ID_818337825" CREATED="1575238887904" MODIFIED="1575238893112">
<node TEXT="Attitude subjective" ID="ID_1458901294" CREATED="1575238961067" MODIFIED="1575238969600"/>
<node TEXT="Remplace les gestes, mimiques, intonations" ID="ID_1708771216" CREATED="1575238786167" MODIFIED="1575238882360"/>
<node TEXT="montrer plut&#xf4;t que dire" ID="ID_1458650089" CREATED="1575239025028" MODIFIED="1575239031427"/>
</node>
<node TEXT="Le tchat" ID="ID_972070502" CREATED="1575240740283" MODIFIED="1575240747317">
<node TEXT="Parfois synchrone, parfois non." ID="ID_1205295086" CREATED="1575240748630" MODIFIED="1575240779677"/>
<node TEXT="Indices contextuels forts" ID="ID_77869570" CREATED="1575240780110" MODIFIED="1575240786499"/>
<node TEXT="Tours de paroles segment&#xe9;s" ID="ID_971520480" CREATED="1575240805520" MODIFIED="1575240878843"/>
<node TEXT="Conversations entrelac&#xe9;es" ID="ID_464582331" CREATED="1575240929788" MODIFIED="1575240941056">
<node TEXT="Quiproquo" ID="ID_1623621173" CREATED="1575240943203" MODIFIED="1575240948763"/>
<node TEXT="Situations humoristiques" ID="ID_816264224" CREATED="1575240949163" MODIFIED="1575240953649"/>
</node>
<node TEXT="&#xc9;moticone: Port&#xe9;e variable" ID="ID_94048404" CREATED="1575260093511" MODIFIED="1575260103300"/>
<node TEXT="&#xc9;nonc&#xe9;s sur plusieurs lignes, s&#xe9;parations syntaxiques ou non" ID="ID_1730871314" CREATED="1575260541509" MODIFIED="1575260562710"/>
<node TEXT="Prise en compte/prise en charge" ID="ID_266160815" CREATED="1575260407487" MODIFIED="1575260417401">
<node TEXT="L&apos;&#xe9;moticone permet de ne pas prendre position, mais de montrer qu&apos;on a bien re&#xe7;u ce qui a &#xe9;t&#xe9; dit" ID="ID_362215456" CREATED="1575260417403" MODIFIED="1575260451519"/>
</node>
</node>
</node>
<node TEXT="Georgalou - Discourse and identity on Facebook" POSITION="left" ID="ID_573961500" CREATED="1576432501896" MODIFIED="1576439987458">
<edge COLOR="#7c0000"/>
<node TEXT="Nouveaux &#xe9;l&#xe9;ments du langage" ID="ID_884645106" CREATED="1576432511262" MODIFIED="1576432547779">
<node TEXT="Ponctuations multiples ?!" ID="ID_1995619041" CREATED="1576432549422" MODIFIED="1576437103533"/>
<node TEXT="Interjections" ID="ID_607633784" CREATED="1576437113783" MODIFIED="1576437117640"/>
<node TEXT="Majuscules" ID="ID_1383386533" CREATED="1576437118071" MODIFIED="1576437122120"/>
</node>
<node TEXT="Analyse du discours (Baxter 2010)" ID="ID_1424801803" CREATED="1576432904446" MODIFIED="1576433023929">
<node TEXT="Variabilit&#xe9;" ID="ID_1655319920" CREATED="1576432921042" MODIFIED="1576432935538">
<node TEXT="Audience" ID="ID_1287238103" CREATED="1576432935545" MODIFIED="1576432942327"/>
<node TEXT="Contexte" ID="ID_1017431291" CREATED="1576432943825" MODIFIED="1576432954803"/>
</node>
<node TEXT="Nature du langage" ID="ID_397305792" CREATED="1576432966755" MODIFIED="1576432977141">
<node TEXT="Description" ID="ID_950767559" CREATED="1576432978593" MODIFIED="1576432983163"/>
<node TEXT="Narration" ID="ID_868814925" CREATED="1576432983797" MODIFIED="1576432987481"/>
<node TEXT="Remarques" ID="ID_877450704" CREATED="1576432988675" MODIFIED="1576432996039"/>
<node TEXT="Commentaires" ID="ID_1281731741" CREATED="1576432996663" MODIFIED="1576433001128"/>
<node TEXT="Blagues" ID="ID_1309797672" CREATED="1576433001508" MODIFIED="1576433003773"/>
</node>
<node TEXT="R&#xe9;pertoire" ID="ID_1024196052" CREATED="1576433048058" MODIFIED="1576433050691">
<node TEXT="Vocabulaire" ID="ID_709732839" CREATED="1576433053478" MODIFIED="1576433057503"/>
<node TEXT="Grammaire" ID="ID_736734389" CREATED="1576433058284" MODIFIED="1576433066147"/>
<node TEXT="Figures de style" ID="ID_137013127" CREATED="1576433075268" MODIFIED="1576433080277"/>
</node>
<node TEXT="Approche" ID="ID_1561649830" CREATED="1576433106369" MODIFIED="1576433112084">
<node TEXT="Contexte psychologique" ID="ID_1939058865" CREATED="1576433112090" MODIFIED="1576433123678"/>
<node TEXT="Contexte sociopolitique" ID="ID_791737082" CREATED="1576433124443" MODIFIED="1576433130862"/>
</node>
</node>
<node TEXT="&#xc9;l&#xe9;ments d&apos;analyse" ID="ID_994837948" CREATED="1576433185944" MODIFIED="1576433207370">
<node TEXT="Intertextualit&#xe9;" ID="ID_1353601895" CREATED="1576433212711" MODIFIED="1576433223175">
<node TEXT="Liens avec les textes pr&#xe9;c&#xe9;dents" ID="ID_261026690" CREATED="1576433278055" MODIFIED="1576433286703"/>
</node>
<node TEXT="Interdiscursivit&#xe9;" ID="ID_1355846398" CREATED="1576433224084" MODIFIED="1576433232077">
<node TEXT="Interaction et superposition des &#xe9;changes" ID="ID_1296550423" CREATED="1576433265025" MODIFIED="1576433274898"/>
</node>
<node TEXT="Multimodalit&#xe9;" ID="ID_398609083" CREATED="1576433232685" MODIFIED="1576433253744">
<node TEXT="Images" ID="ID_996455266" CREATED="1576433241535" MODIFIED="1576433243920"/>
<node TEXT="Textes" ID="ID_1799781603" CREATED="1576433244372" MODIFIED="1576433245825"/>
<node TEXT="Vid&#xe9;os" ID="ID_421888374" CREATED="1576433246259" MODIFIED="1576433247808"/>
</node>
</node>
<node TEXT="Localisation" FOLDED="true" ID="ID_206247466" CREATED="1576433752181" MODIFIED="1576433841871">
<node TEXT="Textualisation" FOLDED="true" ID="ID_1774810454" CREATED="1576433842212" MODIFIED="1576433850744">
<node TEXT="&#xc9;l&#xe9;ment culturel" ID="ID_726017284" CREATED="1576433850747" MODIFIED="1576433859642"/>
</node>
<node TEXT="M&#xe9;tonymie" FOLDED="true" ID="ID_1450191917" CREATED="1576433862048" MODIFIED="1576433910984">
<node TEXT="Inclut la localisation, mais dans un autre type lexical" ID="ID_1282192414" CREATED="1576433915495" MODIFIED="1576433947584"/>
</node>
<node TEXT="Personnification" ID="ID_1888056104" CREATED="1576433968022" MODIFIED="1576433972282"/>
<node TEXT="S&#xe9;miotique transgressive" ID="ID_508849276" CREATED="1576434327152" MODIFIED="1576434342653">
<node TEXT="Signes qui ne vont pas ensemble, dans un m&#xea;me discours. Pour marquer l&apos;opposition implicite" ID="ID_1864195644" CREATED="1576434347915" MODIFIED="1576434379898"/>
</node>
</node>
<node TEXT="Temps" FOLDED="true" ID="ID_1259089697" CREATED="1576436972135" MODIFIED="1576436974049">
<node TEXT="Temporalit&#xe9;" ID="ID_935107929" CREATED="1576436975051" MODIFIED="1576436979169"/>
<node TEXT="Notion de maintenant" ID="ID_63573093" CREATED="1576436979945" MODIFIED="1576436985093"/>
<node TEXT="Cycles" ID="ID_1458339480" CREATED="1576437010439" MODIFIED="1576437014557"/>
<node TEXT="Saisons, f&#xea;tes" ID="ID_566591651" CREATED="1576437044816" MODIFIED="1576437049959"/>
<node TEXT="Pass&#xe9; et futur" ID="ID_1383560062" CREATED="1576437051190" MODIFIED="1576437056318"/>
<node TEXT="&#xc2;ge, anniversaires" ID="ID_1886330627" CREATED="1576437056834" MODIFIED="1576437061864"/>
</node>
<node TEXT="&#xc9;ducation et expertise" FOLDED="true" ID="ID_869848700" CREATED="1576437570827" MODIFIED="1576437579348">
<node TEXT="Montrer son expertise" ID="ID_1657747338" CREATED="1576437580284" MODIFIED="1576437611763"/>
<node TEXT="Se r&#xe9;clamer le droit de mener la discussion (entitlement)" ID="ID_1490384821" CREATED="1576437612334" MODIFIED="1576437625926"/>
<node TEXT="Montrer ses r&#xe9;ussites acad&#xe9;miques" ID="ID_1508946328" CREATED="1576437628508" MODIFIED="1576437637956"/>
<node TEXT="R&#xe9;f&#xe9;rences, imp&#xe9;ratifs, pr&#xe9;supposition, souhaits" ID="ID_718153801" CREATED="1576437668961" MODIFIED="1576437681186"/>
</node>
<node TEXT="Position" ID="ID_727200733" CREATED="1576438549864" MODIFIED="1576438559654">
<node TEXT="Expression" FOLDED="true" ID="ID_986508069" CREATED="1576438633543" MODIFIED="1576438637443">
<node TEXT="Attitude" ID="ID_274391283" CREATED="1576438566378" MODIFIED="1576438593868"/>
<node TEXT="&#xc9;motion" ID="ID_920344361" CREATED="1576438594326" MODIFIED="1576438596613"/>
<node TEXT="Croyance" ID="ID_592625157" CREATED="1576438596770" MODIFIED="1576438599820"/>
<node TEXT="&#xc9;valuation/jugement" ID="ID_717619617" CREATED="1576438600172" MODIFIED="1576438605956"/>
<node TEXT="Engagement" ID="ID_1367600286" CREATED="1576438606159" MODIFIED="1576438609780"/>
</node>
<node TEXT="Attributs linguistiques" ID="ID_825473217" CREATED="1576438657794" MODIFIED="1576438663960">
<node TEXT="Modalit&#xe9;" ID="ID_197540513" CREATED="1576438664912" MODIFIED="1576438672368"/>
<node TEXT="&#xc9;valuation" ID="ID_1335244587" CREATED="1576438672980" MODIFIED="1576438676694"/>
<node TEXT="Politesse" ID="ID_247379549" CREATED="1576438698682" MODIFIED="1576438703989"/>
<node TEXT="&#xc9;videntialit&#xe9;" ID="ID_1818362871" CREATED="1576438704862" MODIFIED="1576438708497"/>
<node TEXT="Intensit&#xe9;" ID="ID_517078263" CREATED="1576438742226" MODIFIED="1576438746807"/>
</node>
</node>
</node>
</node>
</map>

+ 57
- 0
parsing_functions.py View File

@@ -0,0 +1,57 @@
import os
import re
import pandas as pd
import requests
from urllib.parse import unquote
import newspaper

def get_comments(file_path):
    """Load one exported comments workbook into a DataFrame.

    The first five rows of the sheet are skipped and the remaining columns
    are named comment_id, nested_id, name, id, date, likes, comment, source.

    ``nested_id`` cells are expected to look like ``"<parent>-<child>"``;
    missing cells are treated as ``"0-0"``. A missing ``comment_id`` is
    replaced by the ``<parent>`` part, and ``nested_id`` itself is replaced
    by the ``<child>`` part (both as strings). The ``source`` column is
    dropped before returning.

    Args:
        file_path: Path of the Excel file to read.

    Returns:
        The cleaned DataFrame (all columns above except ``source``).

    Raises:
        AttributeError: if a non-null ``nested_id`` does not start with the
            ``digits-digits`` pattern (``re.match`` returns ``None``).
    """
    column_names = ['comment_id',
                    'nested_id',
                    'name',
                    'id',
                    'date',
                    'likes',
                    'comment',
                    'source']
    frame = pd.read_excel(file_path, skiprows=5, names=column_names)

    id_pattern = r"([0-9]+)\-([0-9]+)"

    def _id_part(value, group_index):
        # Pull one numeric half out of a "parent-child" identifier.
        return re.match(id_pattern, value).group(group_index)

    # Rows without a nested id behave as if they carried "0-0".
    nested_filled = frame["nested_id"].mask(pd.isnull, "0-0")
    parent_part = nested_filled.apply(lambda value: _id_part(value, 1))
    child_part = nested_filled.apply(lambda value: _id_part(value, 2))

    # Only fill comment_id where it is missing; existing ids are kept as-is.
    frame["comment_id"] = frame["comment_id"].mask(pd.isnull, parent_part)
    frame["nested_id"] = child_part

    del frame["source"]
    return frame

def get_text_article(file_path, timeout=30):
    """Return the text of the article linked from the post described in a workbook.

    The post URL is read from the ``url`` column of the first row that
    follows one skipped row of the sheet. The post page is downloaded, every
    ``l.facebook.com`` redirect link found in it is stripped of its redirect
    prefix and URL-decoded twice, and the third-from-last link (cut at its
    first greedy ``&amp;`` match) is downloaded and handed to ``newspaper``.
    If any part of that lookup fails, the post page itself is parsed instead.

    Args:
        file_path: Path of the Excel file describing the post.
        timeout: Seconds to wait for each HTTP request before giving up.
            Defaults to 30 so a stalled server cannot block a batch run.

    Returns:
        The parsed article text with blank-line breaks (``"\\n\\n"``)
        replaced by single spaces, or ``""`` if parsing fails.

    Raises:
        Whatever ``pd.read_excel`` or the first ``requests.get`` raises;
        only the article download and the parsing step are best-effort.
    """
    post_url = pd.read_excel(file_path, skiprows=1, nrows=1, header=None,
                             names=['source', 'url'] + [''] * 6)['url'][0]
    post_response = requests.get(post_url, timeout=timeout)
    # Use the decoded body. str() on the raw bytes would give the "b'...'"
    # repr, with escaped newlines and non-ASCII characters, not real HTML.
    html_content = post_response.text
    link_urls = re.findall(r'http[s]?://l\.facebook(?:[a-zA-Z]|[0-9]|[$-_@.&+]|[!*\(\),]|(?:%[0-9a-fA-F][0-9a-fA-F]))+', html_content)
    u_link_urls = [unquote(unquote(u.replace("https://l.facebook.com/l.php?u=", ""))) for u in link_urls]
    article = newspaper.Article('')
    try:
        # Any link without "&amp;" makes re.search return None and sends us
        # to the fallback below, as does having fewer than three links.
        url_article = [re.search("(.*)&amp;.*", u).group(1) for u in u_link_urls][-3]
        article_response = requests.get(url_article, timeout=timeout)
        article.set_html(article_response.content)
    except Exception:
        # Best-effort: fall back to the post page, but let KeyboardInterrupt
        # and SystemExit propagate so a long batch can still be stopped.
        article.set_html(html_content)
    try:
        article.parse()
        text_article = article.text.replace("\n\n", " ")
    except Exception:
        text_article = ""
    return text_article

def getListOfFiles(dirName):
    """Recursively collect the paths of all non-directory entries under dirName.

    Entries are visited in the order ``os.listdir`` reports them; when an
    entry is a directory, its own files are inserted at that position.

    Args:
        dirName: Directory to scan.

    Returns:
        A list of paths, each built by joining ``dirName`` with the relative
        location of the entry. Empty if the tree contains no files.
    """
    collected = []
    for entry_name in os.listdir(dirName):
        entry_path = os.path.join(dirName, entry_name)
        if not os.path.isdir(entry_path):
            collected.append(entry_path)
            continue
        # Descend into the sub-directory and splice its files in here.
        collected.extend(getListOfFiles(entry_path))
    return collected

+ 113
- 0
textes_articles.ipynb View File

@@ -0,0 +1,113 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [],
"source": [
"import parsing_functions as pf\n",
"import re\n",
"import pandas as pd\n",
"import time"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"textes_articles = []\n",
"for xlpath in listOfFiles:\n",
" time.sleep(3)\n",
" media, post_id = re.match(r\"data/([A-Z]+)/comments([0-9a-z\\-]+)\\.xlsx\",xlpath).groups()\n",
" textes_articles.append([media,post_id,pf.get_text_article(xlpath)])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'textes_articles' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-b88ef33508d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtextes_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'media'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'post_id'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles' is not defined"
]
}
],
"source": [
"textes_articles_df = pd.DataFrame(textes_articles, columns=['media','post_id','text'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'textes_articles_df' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-cc028516ec1f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"textes_articles_df.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles_df' is not defined"
]
}
],
"source": [
"textes_articles_df.to_csv(\"textes_articles_df.csv\")"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

Loading…
Cancel
Save