Présentation du package sentometrics d'analyse chronologique de sentiments
You cannot select more than 25 topics. Topics must start with a letter or number, can include dashes ('-') and can be up to 35 characters long.

161 lines
3.6KB

  1. ---
  2. title: "Préparation des données Sentometrics"
  3. author: "François Pelletier"
  4. date: "06/10/2019"
  5. output: html_document
  6. ---
  7. ```{r setup, include=FALSE}
  8. knitr::opts_chunk$set(echo = TRUE)
  9. ```
  10. ```{r}
  11. library("jsonlite")
  12. library("tidyverse")
  13. library("RSQLite")
  14. library("DBI")
  15. library("lubridate")
  16. ```
  17. ```{r}
  # Parse one sample blog post into a nested R list for exploration.
  blog_exemple <- jsonlite::read_json(
    path = "google_news_blogs/blogs/blogs_0000001.json"
  )
  19. ```
  20. - Identifiant
  21. ```{r}
  22. blog_exemple$uuid
  23. ```
  24. - Date
  25. ```{r}
  26. blog_exemple$published
  27. ```
  28. - Contenu
  29. ```{r}
  30. blog_exemple$text
  31. ```
  32. - Features
  33. - Persons
  34. ```{r}
  # Names of the persons attached to the example post.
  sapply(blog_exemple$entities$persons, FUN = function(entite) entite$name)
  36. ```
  37. - Organizations
  38. ```{r}
  # Names of the organizations attached to the example post.
  sapply(blog_exemple$entities$organizations, FUN = function(entite) entite$name)
  40. ```
  41. - Locations
  42. ```{r}
  # Names of the locations attached to the example post.
  sapply(blog_exemple$entities$locations, FUN = function(entite) entite$name)
  44. ```
  45. ## Dataframes
  46. Core
  47. ```{r}
  # Extracts the `name` field of every entity in a list of entities.
  #
  # list_entities: list whose elements are themselves lists carrying a `name`
  #   field (may be empty or NULL).
  # Returns a character vector with one name per entity; an entity without a
  #   `name` yields NA_character_. When there are no entities at all, a single
  #   NA_character_ is returned so callers always get at least one value.
  extract_names <- function(list_entities) {
    # vapply() guarantees a character vector whatever the input looks like,
    # and [["name"]] avoids the partial matching that `$name` performs on lists.
    name_entities <- vapply(
      list_entities,
      function(entity) {
        entity_name <- entity[["name"]]
        if (is.null(entity_name) || length(entity_name) == 0L) {
          NA_character_
        } else {
          as.character(entity_name[[1L]])
        }
      },
      character(1),
      USE.NAMES = FALSE
    )
    if (length(name_entities) > 0) {
      name_entities
    } else {
      NA_character_
    }
  }
  # Builds the one-row "core" tibble (post-level fields) for a parsed post.
  #
  # json_contents: nested list for one post, as read by jsonlite::read_json().
  #   The fields read are uuid, text, and thread$site, thread$site_type,
  #   thread$country, thread$published, thread$title_full.
  # Returns a tibble with columns uuid, site, site_type, country, published,
  #   title_full and text.
  #
  # Text fields default to "" and `published` defaults to ISOdate(1900, 1, 1).
  # A field that is absent from the list is NULL rather than NA, so NULL and
  # zero-length values are defaulted explicitly before coalesce(); otherwise
  # tibble() would not produce the expected one-row, seven-column result.
  generer_core_df <- function(json_contents) {
    valeur_ou <- function(x, defaut) {
      if (is.null(x) || length(x) == 0L) {
        return(defaut)
      }
      coalesce(x, defaut)
    }

    thread <- json_contents[["thread"]]
    date_defaut <- ISOdate(1900, 1, 1)

    # Only parse the date when there is something to parse.
    published_brut <- thread[["published"]]
    published <- if (is.null(published_brut) || length(published_brut) == 0L) {
      date_defaut
    } else {
      coalesce(lubridate::as_datetime(published_brut), date_defaut)
    }

    tibble(
      uuid = valeur_ou(json_contents[["uuid"]], ""),
      site = valeur_ou(thread[["site"]], ""),
      site_type = valeur_ou(thread[["site_type"]], ""),
      country = valeur_ou(thread[["country"]], ""),
      published = published,
      title_full = valeur_ou(thread[["title_full"]], ""),
      text = valeur_ou(json_contents[["text"]], "")
    )
  }
  # Builds the long-format "entities" tibble for a parsed post.
  #
  # json_contents: nested list for one post; reads `uuid` and the `persons`,
  #   `organizations` and `locations` lists under `entities`.
  # Returns a tibble with columns uuid, entity_type and entity, one row per
  #   named entity. Rows containing an NA (e.g. the placeholder produced by
  #   extract_names() for an empty entity list) are removed.
  generer_entities_df <- function(json_contents) {
    types_entites <- c("persons", "organizations", "locations")

    # One tibble per entity type; the scalar uuid and type are recycled
    # against the vector of names.
    par_type <- lapply(types_entites, function(type_entite) {
      tibble(
        uuid = json_contents$uuid,
        entity_type = type_entite,
        entity = extract_names(json_contents$entities[[type_entite]])
      )
    })

    # Return the value visibly (the original ended on an assignment, which
    # returns invisibly).
    na.omit(bind_rows(par_type))
  }
  79. ```
  80. ```{r}
  # Build the post-level table for the example post and inspect its columns.
  core_df <- generer_core_df(blog_exemple)
  glimpse(core_df)
  83. ```
  84. ```{r}
  # Build the entity table for the example post and inspect its columns.
  entities_df <- generer_entities_df(blog_exemple)
  glimpse(entities_df)
  87. ```
  88. ## Création des schémas de la base de données
  89. ```{r}
  # Start from a clean slate: remove any database file left by a previous run.
  if (file.exists("google_news.sqlite")) {
    file.remove("google_news.sqlite")
  }

  # Connect to the SQLite file and create one empty table per example data
  # frame, using core_df / entities_df to define the columns.
  con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
  dbCreateTable(conn = con, name = "core", fields = core_df)
  dbCreateTable(conn = con, name = "entities", fields = entities_df)
  95. ```
  96. ## Importation des données
  97. ```{r}
  # List the JSON files to import. `pattern` is a regular expression, not a
  # shell glob, so match a literal ".json" extension at the end of the name
  # ("*.json" would also accept names such as "x_json.bak").
  file_blogs <- list.files(
    path = "google_news_blogs/blogs",
    pattern = "\\.json$",
    full.names = TRUE
  )
  file_news <- list.files(
    path = "google_news_blogs/news",
    pattern = "\\.json$",
    full.names = TRUE
  )
  100. ```
  101. ```{r}
  # Reads one JSON post and appends it to the "core" and "entities" tables.
  #
  # file_path: path to a JSON file describing a single post.
  # conn: DBI connection to write to; defaults to the global `con` so existing
  #   calls of the form traiter_json(path) keep working.
  # Returns the value of the last dbAppendTable() call (the "entities" append).
  #
  # Both appends run inside one transaction so a failure on the second table
  # does not leave a "core" row without its entities.
  traiter_json <- function(file_path, conn = con) {
    json_contents <- jsonlite::read_json(file_path)
    core_df <- generer_core_df(json_contents)
    entities_df <- generer_entities_df(json_contents)
    DBI::dbWithTransaction(conn, {
      dbAppendTable(conn, "core", core_df)
      dbAppendTable(conn, "entities", entities_df)
    })
  }
  109. ```
  110. ## Traitement des fichiers
  111. ```{r eval=FALSE}
  # Import every blog file, printing a progress line every 1000 files.
  # The counter is an integer: with a double, paste0() would render 100000
  # as "1e+05" in the progress message.
  i <- 0L # number of blog files already processed
  for (file_blog in file_blogs) {
    if (i %% 1000L == 0L) {
      print(paste0(i, ": Traitement de ", file_blog))
    }
    traiter_json(file_blog)
    i <- i + 1L
  }

  # Same import for the news files, with its own integer counter.
  ii <- 0L # number of news files already processed
  for (file_article in file_news) {
    if (ii %% 1000L == 0L) {
      print(paste0(ii, ": Traitement de ", file_article))
    }
    traiter_json(file_article)
    ii <- ii + 1L
  }
  128. ```