Browse Source

préparation données terminée

master
FRANCOIS PELLETIER 9 months ago
parent
commit
c60ee17705
7 changed files with 192 additions and 2162 deletions
  1. +61
    -0
      Analyse_Articles.ipynb
  2. +0
    -261
      Commentaires sur les réseaux sociaux.mm
  3. +7
    -69
      Traitement Articles.ipynb
  4. +59
    -1792
      Traitement commentaires.ipynb
  5. +5
    -5
      commentaires.ipynb
  6. +54
    -0
      pretraitement.py
  7. +6
    -35
      textes_articles.ipynb

+ 61
- 0
Analyse_Articles.ipynb View File

@@ -0,0 +1,61 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"import pickle"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"with open(\"pickle/textes_articles_df.pickle\",\"rb\") as f:\n",
"    textes_articles_df = pickle.load(f)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.3"
}
},
"nbformat": 4,
"nbformat_minor": 4
}

+ 0
- 261
Commentaires sur les réseaux sociaux.mm View File

@@ -1,261 +0,0 @@
<map version="freeplane 1.7.0">
<!--To view this file, download free mind mapping software Freeplane from http://freeplane.sourceforge.net -->
<node TEXT="Commentaires sur les r&#xe9;seaux sociaux" FOLDED="false" ID="ID_1075161201" CREATED="1573954855044" MODIFIED="1573954867944" STYLE="oval">
<font SIZE="18"/>
<hook NAME="MapStyle">
<properties edgeColorConfiguration="#808080ff,#ff0000ff,#0000ffff,#00ff00ff,#ff00ffff,#00ffffff,#7c0000ff,#00007cff,#007c00ff,#7c007cff,#007c7cff,#7c7c00ff" fit_to_viewport="false"/>

<map_styles>
<stylenode LOCALIZED_TEXT="styles.root_node" STYLE="oval" UNIFORM_SHAPE="true" VGAP_QUANTITY="24.0 pt">
<font SIZE="24"/>
<stylenode LOCALIZED_TEXT="styles.predefined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="default" ICON_SIZE="12.0 pt" COLOR="#000000" STYLE="fork">
<font NAME="SansSerif" SIZE="10" BOLD="false" ITALIC="false"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.details"/>
<stylenode LOCALIZED_TEXT="defaultstyle.attributes">
<font SIZE="9"/>
</stylenode>
<stylenode LOCALIZED_TEXT="defaultstyle.note" COLOR="#000000" BACKGROUND_COLOR="#ffffff" TEXT_ALIGN="LEFT"/>
<stylenode LOCALIZED_TEXT="defaultstyle.floating">
<edge STYLE="hide_edge"/>
<cloud COLOR="#f0f0f0" SHAPE="ROUND_RECT"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.user-defined" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="styles.topic" COLOR="#18898b" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subtopic" COLOR="#cc3300" STYLE="fork">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.subsubtopic" COLOR="#669900">
<font NAME="Liberation Sans" SIZE="10" BOLD="true"/>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.important">
<icon BUILTIN="yes"/>
</stylenode>
</stylenode>
<stylenode LOCALIZED_TEXT="styles.AutomaticLayout" POSITION="right" STYLE="bubble">
<stylenode LOCALIZED_TEXT="AutomaticLayout.level.root" COLOR="#000000" STYLE="oval" SHAPE_HORIZONTAL_MARGIN="10.0 pt" SHAPE_VERTICAL_MARGIN="10.0 pt">
<font SIZE="18"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,1" COLOR="#0033ff">
<font SIZE="16"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,2" COLOR="#00b439">
<font SIZE="14"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,3" COLOR="#990000">
<font SIZE="12"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,4" COLOR="#111111">
<font SIZE="10"/>
</stylenode>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,5"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,6"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,7"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,8"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,9"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,10"/>
<stylenode LOCALIZED_TEXT="AutomaticLayout.level,11"/>
</stylenode>
</stylenode>
</map_styles>
</hook>
<hook NAME="AutomaticEdgeColor" COUNTER="6" RULE="ON_BRANCH_CREATION"/>
<node TEXT="Schultes" POSITION="left" ID="ID_1889975585" CREATED="1573955304008" MODIFIED="1573955306512">
<edge COLOR="#00ff00"/>
<node TEXT="Classification Youtube" ID="ID_960774191" CREATED="1573954869370" MODIFIED="1573955309655">
<node TEXT="&#xc9;tude" ID="ID_1621340516" CREATED="1573955028436" MODIFIED="1573955040248">
<node TEXT="Agressif: 42%" ID="ID_221633146" CREATED="1573954944708" MODIFIED="1573954950067"/>
<node TEXT="Essentiels: 6%" ID="ID_1440746951" CREATED="1573954967031" MODIFIED="1573954981297"/>
<node TEXT="Stupide: 51%" ID="ID_921312086" CREATED="1573954957415" MODIFIED="1573954966495"/>
<node TEXT="Non pertinents: 64%" ID="ID_1705312749" CREATED="1573954934920" MODIFIED="1573954944172"/>
</node>
<node TEXT="The Guardian 2009" ID="ID_352535949" CREATED="1573955053842" MODIFIED="1573955061455">
<node TEXT="Juvenile" ID="ID_1646713003" CREATED="1573955061461" MODIFIED="1573955078889"/>
<node TEXT="Aggressive" ID="ID_862048526" CREATED="1573955079074" MODIFIED="1573955081407"/>
<node TEXT="Misspelled" ID="ID_286709909" CREATED="1573955081602" MODIFIED="1573955087293"/>
<node TEXT="Sexist" ID="ID_153876225" CREATED="1573955087779" MODIFIED="1573955092833"/>
</node>
</node>
<node TEXT="Pourquoi commenter?" ID="ID_866509994" CREATED="1573955165314" MODIFIED="1573955312129">
<node TEXT="12% commentent" ID="ID_1041837989" CREATED="1573955193866" MODIFIED="1573955202502"/>
<node TEXT="34% lisent les commentaires" ID="ID_1372530703" CREATED="1573955232869" MODIFIED="1573955240608"/>
<node TEXT="53% regardent les 2-3 premiers commentaires" ID="ID_1662261189" CREATED="1573955241001" MODIFIED="1573955255585"/>
</node>
<node TEXT="Classification en deux &#xe9;tapes" ID="ID_1588097956" CREATED="1573955358766" MODIFIED="1573955365555">
<node TEXT="Type et qualit&#xe9;" ID="ID_1894181819" CREATED="1573955376045" MODIFIED="1573955381723">
<node TEXT="La distribution est pertinente pour d&#xe9;crire un vid&#xe9;o" ID="ID_993533082" CREATED="1573955400296" MODIFIED="1573955411700"/>
<node TEXT="Lien direct avec les &quot;Like&quot;, donc la rentabilit&#xe9; du vid&#xe9;o pour Youtube" ID="ID_195371144" CREATED="1573955596788" MODIFIED="1573955616313">
<node TEXT="Mod&#xe9;lisation R&#xe9;gression Neg. Binomiale" ID="ID_218766081" CREATED="1573956262222" MODIFIED="1573956280088"/>
</node>
<node TEXT="Types" ID="ID_868118585" CREATED="1573955883446" MODIFIED="1573955884542">
<node TEXT="t1: Discussion" ID="ID_1000797050" CREATED="1573955777851" MODIFIED="1573955783201"/>
<node TEXT="t3: substantiels" ID="ID_1205292896" CREATED="1573955790350" MODIFIED="1573955794297"/>
<node TEXT="t2: inf&#xe9;rieurs" ID="ID_392602295" CREATED="1573955783999" MODIFIED="1573955789685"/>
</node>
<node TEXT="Outils" ID="ID_144483136" CREATED="1573955937321" MODIFIED="1573955941603">
<node TEXT="Offensive: SentiStrength" ID="ID_1491683881" CREATED="1573955828111" MODIFIED="1573955833421"/>
<node TEXT="Liste manuelle de marqueurs &#xe9;motionnels" ID="ID_217513801" CREATED="1573955864375" MODIFIED="1573955877200"/>
<node TEXT="Appariement de mots-cl&#xe9;s et du titre" ID="ID_1268991488" CREATED="1573955952300" MODIFIED="1573955975377"/>
</node>
<node TEXT="" ID="ID_667449169" CREATED="1573955947623" MODIFIED="1573955947623"/>
</node>
<node TEXT="Permettent l&apos;analyse s&#xe9;mantique du vid&#xe9;o" ID="ID_1291677418" CREATED="1573955433090" MODIFIED="1573955442246">
<node TEXT="Mod&#xe9;lisation avec un SVM" ID="ID_1415408576" CREATED="1573956080194" MODIFIED="1573956086398">
<node TEXT="Variable r&#xe9;ponse: cat&#xe9;gorie du vid&#xe9;o (News, Sports, Music, ...)" ID="ID_92620342" CREATED="1573956116685" MODIFIED="1573956166425"/>
<node TEXT="Caract&#xe9;ristiques: Type et qualit&#xe9; des commentaires" ID="ID_575459159" CREATED="1573956125581" MODIFIED="1573956141215"/>
</node>
<node TEXT="" ID="ID_1244708423" CREATED="1573956216954" MODIFIED="1573956259964"/>
</node>
</node>
<node TEXT="Inspiration de Ammari et al." ID="ID_473432831" CREATED="1573955655134" MODIFIED="1573955845106">
<node TEXT="Identifier les commentaires &quot;noisy&quot;" ID="ID_156080744" CREATED="1573955669957" MODIFIED="1573955686377"/>
</node>
<node TEXT="https://www.semanticscholar.org/paper/Leave-a-Comment!-An-In-Depth-Analysis-of-User-on-Schultes-Dorner/d84ec961f13ebc56bd45f63ac78a6e07bbba2a63" ID="ID_779661135" CREATED="1573956897563" MODIFIED="1573956898796"/>
</node>
<node TEXT="Impl&#xe9;mentation" POSITION="right" ID="ID_603062380" CREATED="1573956867473" MODIFIED="1573956871207">
<edge COLOR="#ff00ff"/>
<node TEXT="https://zablo.net/blog/post/twitter-sentiment-analysis-python-scikit-word2vec-nltk-xgboost/index.html" ID="ID_646162134" CREATED="1573956872038" MODIFIED="1573956872849"/>
</node>
<node TEXT="Halt&#xe9; - Emoticones" POSITION="right" ID="ID_1763549449" CREATED="1575238775212" MODIFIED="1575238785034">
<edge COLOR="#00ffff"/>
<node TEXT="Emoticones" ID="ID_386506934" CREATED="1575238883772" MODIFIED="1575238887445">
<node TEXT="Mim&#xe9;tique et gestuelle" ID="ID_1631325076" CREATED="1575239148209" MODIFIED="1575239154376"/>
<node TEXT="Plus fort que les interjections" ID="ID_886136844" CREATED="1575240727401" MODIFIED="1575240735333"/>
<node TEXT="Syst&#xe8;me d&apos;&#xe9;criture" ID="ID_525765510" CREATED="1575239409720" MODIFIED="1575239420069">
<node TEXT="R&#xe8;gles" ID="ID_1483343948" CREATED="1575239421113" MODIFIED="1575239424034"/>
<node TEXT="Oppositions" ID="ID_1849414542" CREATED="1575239424924" MODIFIED="1575239427759"/>
<node TEXT="Usages" ID="ID_1764780335" CREATED="1575239428189" MODIFIED="1575239429883"/>
</node>
<node TEXT="Emojis" FOLDED="true" ID="ID_838340950" CREATED="1575242128366" MODIFIED="1575242133912">
<node TEXT="Banque normalis&#xe9;e" ID="ID_1322377772" CREATED="1575242133919" MODIFIED="1575242238282">
<node TEXT="Verbal" ID="ID_1610044649" CREATED="1575242238977" MODIFIED="1575242243717">
<node TEXT="Modalit&#xe9;" ID="ID_1389558503" CREATED="1575242266230" MODIFIED="1575242268798"/>
</node>
<node TEXT="Non-verbal" FOLDED="true" ID="ID_430063472" CREATED="1575242244105" MODIFIED="1575242250451">
<node TEXT="Objets" ID="ID_900313374" CREATED="1575242250461" MODIFIED="1575242253644"/>
<node TEXT="Actions" ID="ID_1522816072" CREATED="1575242254098" MODIFIED="1575242264462"/>
</node>
<node TEXT="Diff&#xe9;rentes parties du langage (Peirce)" ID="ID_1811986887" CREATED="1575242392026" MODIFIED="1575242450835">
<node TEXT="Iconique" ID="ID_1371581343" CREATED="1575242421848" MODIFIED="1575242424675"/>
<node TEXT="Indiciel" ID="ID_213205468" CREATED="1575242425224" MODIFIED="1575242427614"/>
<node TEXT="Symbolique" ID="ID_349389975" CREATED="1575242428002" MODIFIED="1575242430884"/>
</node>
</node>
</node>
<node TEXT="Provine et al. Le sens de l&apos;&#xe9;moticone est une information additionnelle ou compl&#xe9;mentaire au message" ID="ID_790078370" CREATED="1575245786150" MODIFIED="1575245837675">
<node TEXT="Nues" ID="ID_1650267953" CREATED="1575245860851" MODIFIED="1575245863060"/>
<node TEXT="D&#xe9;but ou fin (+ fr&#xe9;quent)" ID="ID_870280863" CREATED="1575245863647" MODIFIED="1575245884155"/>
<node TEXT="Int&#xe9;rieures" ID="ID_37382522" CREATED="1575245868733" MODIFIED="1575245871544"/>
</node>
</node>
<node TEXT="Interjections et sigles" FOLDED="true" ID="ID_818337825" CREATED="1575238887904" MODIFIED="1575238893112">
<node TEXT="Attitude subjective" ID="ID_1458901294" CREATED="1575238961067" MODIFIED="1575238969600"/>
<node TEXT="Remplace les gestes, mimiques, intonations" ID="ID_1708771216" CREATED="1575238786167" MODIFIED="1575238882360"/>
<node TEXT="montrer plut&#xf4;t que dire" ID="ID_1458650089" CREATED="1575239025028" MODIFIED="1575239031427"/>
</node>
<node TEXT="Le tchat" ID="ID_972070502" CREATED="1575240740283" MODIFIED="1575240747317">
<node TEXT="Parfois synchrone, parfois non." ID="ID_1205295086" CREATED="1575240748630" MODIFIED="1575240779677"/>
<node TEXT="Indices contextuels forts" ID="ID_77869570" CREATED="1575240780110" MODIFIED="1575240786499"/>
<node TEXT="Tours de paroles segment&#xe9;s" ID="ID_971520480" CREATED="1575240805520" MODIFIED="1575240878843"/>
<node TEXT="Conversations entrelac&#xe9;es" ID="ID_464582331" CREATED="1575240929788" MODIFIED="1575240941056">
<node TEXT="Quiproquo" ID="ID_1623621173" CREATED="1575240943203" MODIFIED="1575240948763"/>
<node TEXT="Situations humoristiques" ID="ID_816264224" CREATED="1575240949163" MODIFIED="1575240953649"/>
</node>
<node TEXT="&#xc9;moticone: Port&#xe9;e variable" ID="ID_94048404" CREATED="1575260093511" MODIFIED="1575260103300"/>
<node TEXT="&#xc9;nonc&#xe9;s sur plusieurs lignes, s&#xe9;parations syntaxiques ou non" ID="ID_1730871314" CREATED="1575260541509" MODIFIED="1575260562710"/>
<node TEXT="Prise en compte/prise en charge" ID="ID_266160815" CREATED="1575260407487" MODIFIED="1575260417401">
<node TEXT="L&apos;&#xe9;moticone permet de ne pas prendre position, mais de montrer qu&apos;on a bien re&#xe7;u ce qui a &#xe9;t&#xe9; dit" ID="ID_362215456" CREATED="1575260417403" MODIFIED="1575260451519"/>
</node>
</node>
</node>
<node TEXT="Georgalou - Discourse and identity on Facebook" POSITION="left" ID="ID_573961500" CREATED="1576432501896" MODIFIED="1576439987458">
<edge COLOR="#7c0000"/>
<node TEXT="Nouveaux &#xe9;l&#xe9;ments du langage" ID="ID_884645106" CREATED="1576432511262" MODIFIED="1576432547779">
<node TEXT="Ponctuations multiples ?!" ID="ID_1995619041" CREATED="1576432549422" MODIFIED="1576437103533"/>
<node TEXT="Interjections" ID="ID_607633784" CREATED="1576437113783" MODIFIED="1576437117640"/>
<node TEXT="Majuscules" ID="ID_1383386533" CREATED="1576437118071" MODIFIED="1576437122120"/>
</node>
<node TEXT="Analyse du discours (Baxter 2010)" ID="ID_1424801803" CREATED="1576432904446" MODIFIED="1576433023929">
<node TEXT="Variabilit&#xe9;" ID="ID_1655319920" CREATED="1576432921042" MODIFIED="1576432935538">
<node TEXT="Audience" ID="ID_1287238103" CREATED="1576432935545" MODIFIED="1576432942327"/>
<node TEXT="Contexte" ID="ID_1017431291" CREATED="1576432943825" MODIFIED="1576432954803"/>
</node>
<node TEXT="Nature du langage" ID="ID_397305792" CREATED="1576432966755" MODIFIED="1576432977141">
<node TEXT="Description" ID="ID_950767559" CREATED="1576432978593" MODIFIED="1576432983163"/>
<node TEXT="Narration" ID="ID_868814925" CREATED="1576432983797" MODIFIED="1576432987481"/>
<node TEXT="Remarques" ID="ID_877450704" CREATED="1576432988675" MODIFIED="1576432996039"/>
<node TEXT="Commentaires" ID="ID_1281731741" CREATED="1576432996663" MODIFIED="1576433001128"/>
<node TEXT="Blagues" ID="ID_1309797672" CREATED="1576433001508" MODIFIED="1576433003773"/>
</node>
<node TEXT="R&#xe9;pertoire" ID="ID_1024196052" CREATED="1576433048058" MODIFIED="1576433050691">
<node TEXT="Vocabulaire" ID="ID_709732839" CREATED="1576433053478" MODIFIED="1576433057503"/>
<node TEXT="Grammaire" ID="ID_736734389" CREATED="1576433058284" MODIFIED="1576433066147"/>
<node TEXT="Figures de style" ID="ID_137013127" CREATED="1576433075268" MODIFIED="1576433080277"/>
</node>
<node TEXT="Approche" ID="ID_1561649830" CREATED="1576433106369" MODIFIED="1576433112084">
<node TEXT="Contexte psychologique" ID="ID_1939058865" CREATED="1576433112090" MODIFIED="1576433123678"/>
<node TEXT="Contexte sociopolitique" ID="ID_791737082" CREATED="1576433124443" MODIFIED="1576433130862"/>
</node>
</node>
<node TEXT="&#xc9;l&#xe9;ments d&apos;analyse" ID="ID_994837948" CREATED="1576433185944" MODIFIED="1576433207370">
<node TEXT="Intertextualit&#xe9;" ID="ID_1353601895" CREATED="1576433212711" MODIFIED="1576433223175">
<node TEXT="Liens avec les textes pr&#xe9;c&#xe9;dents" ID="ID_261026690" CREATED="1576433278055" MODIFIED="1576433286703"/>
</node>
<node TEXT="Interdiscursivit&#xe9;" ID="ID_1355846398" CREATED="1576433224084" MODIFIED="1576433232077">
<node TEXT="Interaction et superposition des &#xe9;changes" ID="ID_1296550423" CREATED="1576433265025" MODIFIED="1576433274898"/>
</node>
<node TEXT="Multimodalit&#xe9;" ID="ID_398609083" CREATED="1576433232685" MODIFIED="1576433253744">
<node TEXT="Images" ID="ID_996455266" CREATED="1576433241535" MODIFIED="1576433243920"/>
<node TEXT="Textes" ID="ID_1799781603" CREATED="1576433244372" MODIFIED="1576433245825"/>
<node TEXT="Vid&#xe9;os" ID="ID_421888374" CREATED="1576433246259" MODIFIED="1576433247808"/>
</node>
</node>
<node TEXT="Localisation" FOLDED="true" ID="ID_206247466" CREATED="1576433752181" MODIFIED="1576433841871">
<node TEXT="Textualisation" FOLDED="true" ID="ID_1774810454" CREATED="1576433842212" MODIFIED="1576433850744">
<node TEXT="&#xc9;l&#xe9;ment culturel" ID="ID_726017284" CREATED="1576433850747" MODIFIED="1576433859642"/>
</node>
<node TEXT="M&#xe9;tonymie" FOLDED="true" ID="ID_1450191917" CREATED="1576433862048" MODIFIED="1576433910984">
<node TEXT="Inclus la localisation, mais dans un autre type lexical" ID="ID_1282192414" CREATED="1576433915495" MODIFIED="1576433947584"/>
</node>
<node TEXT="Personnification" ID="ID_1888056104" CREATED="1576433968022" MODIFIED="1576433972282"/>
<node TEXT="S&#xe9;miotique transgressive" ID="ID_508849276" CREATED="1576434327152" MODIFIED="1576434342653">
<node TEXT="Signes qui ne vont pas ensemble, dans un m&#xea;me discours. Pour marquer l&apos;opposition implicite" ID="ID_1864195644" CREATED="1576434347915" MODIFIED="1576434379898"/>
</node>
</node>
<node TEXT="Temps" FOLDED="true" ID="ID_1259089697" CREATED="1576436972135" MODIFIED="1576436974049">
<node TEXT="Temporalit&#xe9;" ID="ID_935107929" CREATED="1576436975051" MODIFIED="1576436979169"/>
<node TEXT="Notion de maintenant" ID="ID_63573093" CREATED="1576436979945" MODIFIED="1576436985093"/>
<node TEXT="Cycles" ID="ID_1458339480" CREATED="1576437010439" MODIFIED="1576437014557"/>
<node TEXT="Saisons, f&#xea;tes" ID="ID_566591651" CREATED="1576437044816" MODIFIED="1576437049959"/>
<node TEXT="Pass&#xe9; et futur" ID="ID_1383560062" CREATED="1576437051190" MODIFIED="1576437056318"/>
<node TEXT="&#xc2;ge, anniversaires" ID="ID_1886330627" CREATED="1576437056834" MODIFIED="1576437061864"/>
</node>
<node TEXT="&#xc9;ducation et expertise" FOLDED="true" ID="ID_869848700" CREATED="1576437570827" MODIFIED="1576437579348">
<node TEXT="Montrer son expertise" ID="ID_1657747338" CREATED="1576437580284" MODIFIED="1576437611763"/>
<node TEXT="Se r&#xe9;clamer le droit de mener la discussion (entitlement)" ID="ID_1490384821" CREATED="1576437612334" MODIFIED="1576437625926"/>
<node TEXT="Montrer ses r&#xe9;ussites acad&#xe9;miques" ID="ID_1508946328" CREATED="1576437628508" MODIFIED="1576437637956"/>
<node TEXT="R&#xe9;f&#xe9;rences, imp&#xe9;ratifs, pr&#xe9;supposition, souhaits" ID="ID_718153801" CREATED="1576437668961" MODIFIED="1576437681186"/>
</node>
<node TEXT="Position" ID="ID_727200733" CREATED="1576438549864" MODIFIED="1576438559654">
<node TEXT="Expression" FOLDED="true" ID="ID_986508069" CREATED="1576438633543" MODIFIED="1576438637443">
<node TEXT="Attitude" ID="ID_274391283" CREATED="1576438566378" MODIFIED="1576438593868"/>
<node TEXT="&#xc9;motion" ID="ID_920344361" CREATED="1576438594326" MODIFIED="1576438596613"/>
<node TEXT="Croyance" ID="ID_592625157" CREATED="1576438596770" MODIFIED="1576438599820"/>
<node TEXT="&#xc9;valuation/jugement" ID="ID_717619617" CREATED="1576438600172" MODIFIED="1576438605956"/>
<node TEXT="Engagement" ID="ID_1367600286" CREATED="1576438606159" MODIFIED="1576438609780"/>
</node>
<node TEXT="Attributs linguistiques" ID="ID_825473217" CREATED="1576438657794" MODIFIED="1576438663960">
<node TEXT="Modalit&#xe9;" ID="ID_197540513" CREATED="1576438664912" MODIFIED="1576438672368"/>
<node TEXT="&#xc9;valuation" ID="ID_1335244587" CREATED="1576438672980" MODIFIED="1576438676694"/>
<node TEXT="Politesse" ID="ID_247379549" CREATED="1576438698682" MODIFIED="1576438703989"/>
<node TEXT="&#xc9;videntialit&#xe9;" ID="ID_1818362871" CREATED="1576438704862" MODIFIED="1576438708497"/>
<node TEXT="Intensit&#xe9;" ID="ID_517078263" CREATED="1576438742226" MODIFIED="1576438746807"/>
</node>
</node>
</node>
</node>
</map>

+ 7
- 69
Traitement Articles.ipynb View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 153,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -12,7 +12,9 @@
"from nltk.tokenize import toktok, sent_tokenize\n",
"from nltk.parse import CoreNLPParser\n",
"import re\n",
"import pickle"
"import pickle\n",
"import emoji\n",
"import pretraitement as pr"
]
},
{
@@ -64,85 +66,21 @@
},
{
"cell_type": "code",
"execution_count": 19,
"metadata": {},
"outputs": [],
"source": [
"# Prétraitement\n",
"def pretraitement(article):\n",
" # tokeniser par phrases\n",
" article_sentences = sent_tokenize(article)\n",
" article_ner_tokens = []\n",
" article_pos_tokens = []\n",
" compteur_phrase = 0\n",
" for sentence in article_sentences:\n",
" # Tokeniser\n",
" sentence_tokens = tok.tokenize(sentence)\n",
" # Assembler les entités nommées et colocations\n",
" sentence_ner = ner_tagger.tag(sentence_tokens)\n",
" ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']\n",
" # Supprimer les classes fermées avec un POS\n",
" sentence_pos = pos_tagger.tag(sentence_tokens)\n",
" pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in ['ADJ','ADV','INTJ','NOUN','PROPN','VERB']]\n",
" # Ajouter à la liste de phrases tokenisées\n",
" article_ner_tokens.append(ner_tokens)\n",
" article_pos_tokens.append(pos_tokens)\n",
" compteur_phrase += 1\n",
" return article_ner_tokens, article_pos_tokens"
]
},
{
"cell_type": "code",
"execution_count": 21,
"metadata": {},
"outputs": [],
"source": [
"article_pretraite = [pretraitement(x) for x in list(textes_articles_df[\"text\"])]"
]
},
{
"cell_type": "code",
"execution_count": 131,
"metadata": {},
"outputs": [],
"source": [
"def aggreger_ner_tags(article):\n",
" dict_named_entity = {}\n",
" for sentence in article[0]:\n",
" for entity in sentence:\n",
" dict_named_entity[entity] = dict_named_entity.get(entity,0) + 1\n",
" return dict_named_entity"
"article_pretraite = [pr.pretraitement(x,tok,ner_tagger,pos_tagger) for x in list(textes_articles_df[\"text\"])]"
]
},
{
"cell_type": "code",
"execution_count": 132,
"metadata": {},
"outputs": [],
"source": [
"def aggreger_pos_tags(article):\n",
" dict_pos = {}\n",
" for sentence in article[1]:\n",
" for pos in sentence:\n",
" dict_pos[pos] = dict_pos.get(pos,0) + 1\n",
" return dict_pos"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": 165,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df['ner_dict']=[aggreger_ner_tags(article) for article in article_pretraite]\n",
"textes_articles_df['pos_dict']=[aggreger_pos_tags(article) for article in article_pretraite]"
"textes_articles_df['ner_dict']=[pr.aggreger_ner_tags(article) for article in article_pretraite]\n",
"textes_articles_df['pos_dict']=[pr.aggreger_pos_tags(article) for article in article_pretraite]"
]
},
{


+ 59
- 1792
Traitement commentaires.ipynb
File diff suppressed because it is too large
View File


+ 5
- 5
commentaires.ipynb View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -14,7 +14,7 @@
},
{
"cell_type": "code",
"execution_count": 2,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -23,7 +23,7 @@
},
{
"cell_type": "code",
"execution_count": 3,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -40,7 +40,7 @@
},
{
"cell_type": "code",
"execution_count": 4,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -49,7 +49,7 @@
},
{
"cell_type": "code",
"execution_count": 6,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [


+ 54
- 0
pretraitement.py View File

@@ -0,0 +1,54 @@
import logging

import emoji
from nltk.tokenize import sent_tokenize

# Prétraitement
def pretraitement(article,tok,ner_tagger,pos_tagger):
    """Extract named-entity, part-of-speech and emoji tokens from a text.

    The text is split into sentences with ``sent_tokenize``. Each sentence is
    tokenized with ``tok``, every token is passed through ``emoji.demojize``,
    and tokens whose first character is ":" are set aside as emoji tokens
    before the remaining tokens are handed to the two taggers.

    Args:
        article: Text to process.
        tok: Object exposing ``tokenize(sentence)`` and returning the tokens
            of the sentence.
        ner_tagger: Object exposing ``tag(tokens)``; items whose element ``[1]``
            equals ``'O'`` are discarded.
        pos_tagger: Object exposing ``tag(tokens)``; only items whose element
            ``[1]`` is one of ADJ, ADV, INTJ, NOUN, PROPN, VERB are kept.

    Returns:
        A tuple ``(article_ner_tokens, article_pos_tokens, article_emoji_tokens)``.
        The three lists are aligned: index ``k`` of each one describes the same
        retained sentence. Emoji entries are ``(token, position)`` pairs where
        ``position`` is the 1-based index of the token in the sentence before
        emoji tokens were removed. A sentence made only of emoji tokens is
        kept, with empty NER and POS lists, so its emojis are not lost.

    Processing is best-effort: a sentence whose tokenization or tagging raises
    is skipped and a warning is logged, instead of aborting the whole text.
    """
    open_class_tags = ('ADJ','ADV','INTJ','NOUN','PROPN','VERB')
    article_ner_tokens = []
    article_pos_tokens = []
    article_emoji_tokens = []
    for sentence in sent_tokenize(article):
        if not sentence:
            continue
        try:
            # Tokenize, then turn emojis into their textual ":name:" form
            sentence_tokens = [emoji.demojize(token) for token in tok.tokenize(sentence) if len(token)>0]
            if not sentence_tokens:
                continue
            # NOTE(review): any token starting with ":" (a bare colon included)
            # is treated as an emoji token here -- confirm this is intended.
            emoji_tokens = [(token,i) for i, token in enumerate(sentence_tokens,1) if token[0] == ":"]
            sentence_tokens = [token for token in sentence_tokens if token[0] != ":"]
            ner_tokens = []
            pos_tokens = []
            if sentence_tokens:
                # Keep only the tokens recognised as named entities
                sentence_ner = ner_tagger.tag(sentence_tokens)
                ner_tokens = [ner_token for ner_token in sentence_ner if ner_token[1] != 'O']
                # Drop closed-class words based on their POS tag
                sentence_pos = pos_tagger.tag(sentence_tokens)
                pos_tokens = [pos_token for pos_token in sentence_pos if pos_token[1] in open_class_tags]
            # Appended together, and only once everything succeeded, so the
            # three lists stay aligned sentence by sentence.
            article_ner_tokens.append(ner_tokens)
            article_pos_tokens.append(pos_tokens)
            article_emoji_tokens.append(emoji_tokens)
        except Exception as exc:
            # Catch Exception rather than everything, so that Ctrl-C
            # (KeyboardInterrupt) can still stop a long run.
            logging.getLogger(__name__).warning(
                "pretraitement: sentence skipped (%s: %s)", type(exc).__name__, exc
            )
    return article_ner_tokens, article_pos_tokens, article_emoji_tokens

def aggreger_ner_tags(article):
    """Count how many times each named-entity token occurs in an article.

    Args:
        article: Indexable value whose element ``0`` is an iterable of
            sentences, each sentence being an iterable of hashable tokens.

    Returns:
        A dict mapping each token to its number of occurrences, in order of
        first appearance.
    """
    entity_counts = {}
    # Flatten the per-sentence lists, then tally in a single pass
    all_entities = [ner_token for tagged_sentence in article[0] for ner_token in tagged_sentence]
    for ner_token in all_entities:
        if ner_token in entity_counts:
            entity_counts[ner_token] += 1
        else:
            entity_counts[ner_token] = 1
    return entity_counts

def aggreger_pos_tags(article):
    """Count how many times each part-of-speech token occurs in an article.

    Args:
        article: Indexable value whose element ``1`` is an iterable of
            sentences, each sentence being an iterable of hashable tokens.

    Returns:
        A dict mapping each token to its number of occurrences, in order of
        first appearance.
    """
    pos_counts = {}
    # Flatten the per-sentence lists, then tally in a single pass
    all_pos_tokens = [pos_token for tagged_sentence in article[1] for pos_token in tagged_sentence]
    for pos_token in all_pos_tokens:
        if pos_token in pos_counts:
            pos_counts[pos_token] += 1
        else:
            pos_counts[pos_token] = 1
    return pos_counts

def aggreger_emoji(article):
    """Group the positions of every emoji token found in an article.

    Args:
        article: Indexable value whose element ``2`` is an iterable of
            sentences, each sentence being an iterable of
            ``(emoji_token, position)`` pairs.

    Returns:
        A dict mapping each emoji token to the list of its positions, in the
        order in which they were encountered.
    """
    positions_by_emoji = {}
    for emoji_sentence in article[2]:
        for emoji_name, position in emoji_sentence:
            if emoji_name not in positions_by_emoji:
                positions_by_emoji[emoji_name] = []
            positions_by_emoji[emoji_name].append(position)
    return positions_by_emoji

+ 6
- 35
textes_articles.ipynb View File

@@ -2,7 +2,7 @@
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
@@ -27,47 +27,18 @@
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": true,
"jupyter": {
"outputs_hidden": true
}
},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'textes_articles' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-4-b88ef33508d0>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m \u001b[0;34m=\u001b[0m \u001b[0mpd\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mDataFrame\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0mtextes_articles\u001b[0m\u001b[0;34m,\u001b[0m \u001b[0mcolumns\u001b[0m\u001b[0;34m=\u001b[0m\u001b[0;34m[\u001b[0m\u001b[0;34m'media'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'post_id'\u001b[0m\u001b[0;34m,\u001b[0m\u001b[0;34m'text'\u001b[0m\u001b[0;34m]\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles' is not defined"
]
}
],
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"textes_articles_df = pd.DataFrame(textes_articles, columns=['media','post_id','text'])"
]
},
{
"cell_type": "code",
"execution_count": 5,
"execution_count": null,
"metadata": {},
"outputs": [
{
"ename": "NameError",
"evalue": "name 'textes_articles_df' is not defined",
"output_type": "error",
"traceback": [
"\u001b[0;31m---------------------------------------------------------------------------\u001b[0m",
"\u001b[0;31mNameError\u001b[0m Traceback (most recent call last)",
"\u001b[0;32m<ipython-input-5-cc028516ec1f>\u001b[0m in \u001b[0;36m<module>\u001b[0;34m\u001b[0m\n\u001b[0;32m----> 1\u001b[0;31m \u001b[0mtextes_articles_df\u001b[0m\u001b[0;34m.\u001b[0m\u001b[0mto_csv\u001b[0m\u001b[0;34m(\u001b[0m\u001b[0;34m\"textes_articles_df.csv\"\u001b[0m\u001b[0;34m)\u001b[0m\u001b[0;34m\u001b[0m\u001b[0;34m\u001b[0m\u001b[0m\n\u001b[0m",
"\u001b[0;31mNameError\u001b[0m: name 'textes_articles_df' is not defined"
]
}
],
"outputs": [],
"source": [
"textes_articles_df.to_csv(\"textes_articles_df.csv\")"
]


Loading…
Cancel
Save