Browse Source

préparation des données

master
FRANCOIS PELLETIER 9 months ago
parent
commit
01b0f0bd92
2 changed files with 163 additions and 0 deletions
  1. +4
    -0
      .gitignore
  2. +159
    -0
      Preparation_BD.Rmd

+ 4
- 0
.gitignore View File

@@ -0,0 +1,4 @@
.Rproj.user
.Rhistory
.RData
.Ruserdata

+ 159
- 0
Preparation_BD.Rmd View File

@@ -0,0 +1,159 @@
---
title: "Présentation Sentometrics"
author: "François Pelletier"
date: "06/10/2019"
output: html_document
---

```{r setup, include=FALSE}
knitr::opts_chunk$set(echo = TRUE)
```

```{r}
library("jsonlite")
library("tidyverse")
library("RSQLite")
library("DBI")
library("lubridate")
```


```{r}
blog_exemple <- jsonlite::read_json("google_news_blogs/blogs/blogs_0000001.json")
```

- Identifiant

```{r}
blog_exemple$uuid
```

- Date

```{r}
blog_exemple$published
```

- Contenu

```{r}
blog_exemple$text
```

- Features
- Persons
```{r}
blog_exemple$entities$persons %>% sapply(FUN = function(x) x$name)
```
- Organizations
```{r}
blog_exemple$entities$organizations %>% sapply(FUN = function(x) x$name)
```
- Locations
```{r}
blog_exemple$entities$locations %>% sapply(FUN = function(x) x$name)
```

## Dataframes

Core

```{r}
# Extract the $name field from a list of entity records (persons,
# organizations or locations as parsed from the JSON).
#
# Uses vapply() instead of sapply() so the result is always a character
# vector: sapply() on an empty list returns list(), and returns a list if
# any element's $name is missing, which silently changes the return type.
# Returns NA_character_ (typed NA) when the list is empty so the value can
# be placed in a character column and later dropped with na.omit().
# NOTE(review): assumes every entity record carries a $name — vapply()
# errors loudly otherwise, which is preferable to a silent list result.
extract_names <- function(list_entities) {
  name_entities <- vapply(list_entities, function(x) x$name, character(1))
  if (length(name_entities) > 0) {
    name_entities
  } else {
    NA_character_
  }
}

# Build the one-row "core" tibble for a parsed JSON document.
#
# Every text field is defaulted to "" and the publication date to
# 1900-01-01 via coalesce(), so a missing field never produces an NA row.
generer_core_df <- function(json_contents){
  thread <- json_contents$thread
  date_defaut <- ISOdate(1900, 1, 1)
  tibble(
    uuid = coalesce(json_contents$uuid, ""),
    site = coalesce(thread$site, ""),
    site_type = coalesce(thread$site_type, ""),
    country = coalesce(thread$country, ""),
    published = coalesce(lubridate::as_datetime(thread$published), date_defaut),
    title_full = coalesce(thread$title_full, ""),
    text = coalesce(json_contents$text, "")
  )
}

# Build the long-format "entities" tibble (uuid, entity_type, entity) for a
# parsed JSON document, one row per named entity.
#
# Fixes: the original returned the (invisible) value of a trailing
# assignment; the result is now the explicit final expression. The three
# near-identical tibble() calls are factored into a local helper.
# Rows whose entity is NA (entity type absent from the document) are
# dropped with na.omit() before returning.
generer_entities_df <- function(json_contents){
  # One tibble per entity type; extract_names() yields NA on an empty list.
  lignes_pour_type <- function(entity_type) {
    tibble(uuid = json_contents$uuid,
           entity_type = entity_type,
           entity = json_contents$entities[[entity_type]] %>%
             extract_names)
  }
  na.omit(bind_rows(lignes_pour_type("persons"),
                    lignes_pour_type("organizations"),
                    lignes_pour_type("locations")))
}
```

```{r}
# Sanity check of the core extraction on the sample document.
core_df <- generer_core_df(blog_exemple)
core_df %>% glimpse
```

```{r}
# Sanity check of the entity extraction on the sample document.
entities_df <- generer_entities_df(blog_exemple)
entities_df %>% glimpse
```



## Création des schémas de la base de données

```{r}
# Recreate the SQLite database from scratch: drop any previous file, then
# create empty "core" and "entities" tables whose schemas are inferred
# from the sample data frames built above.
# Fixes: `<-` instead of `=` for top-level assignment, braced if body.
# NOTE(review): `con` is intentionally global — traiter_json() below
# appends through it; it is never disconnected in this notebook.
if (file.exists("google_news.sqlite")) {
  file.remove("google_news.sqlite")
}
con <- dbConnect(drv = RSQLite::SQLite(), dbname = "google_news.sqlite")
dbCreateTable(con, "core", core_df)
dbCreateTable(con, "entities", entities_df)
```

## Importation des données

```{r}
# Collect the JSON files to import. list.files() expects a REGEX in
# `pattern`, not a shell glob: "*.json" only worked by accident (the "*"
# quantifier is degenerate and "." matches any character, so e.g.
# "data_json_old" would also match). "\\.json$" anchors a literal ".json"
# extension at the end of the file name.
file_blogs <- list.files(path = "google_news_blogs/blogs", pattern = "\\.json$", full.names = TRUE)
file_news <- list.files(path = "google_news_blogs/news", pattern = "\\.json$", full.names = TRUE)
```

```{r}

# Parse one JSON file and append its core row and entity rows to the
# database through the global connection `con`.
traiter_json <- function(file_path){
  contenu <- jsonlite::read_json(file_path)
  dbAppendTable(con, "core", generer_core_df(contenu))
  dbAppendTable(con, "entities", generer_entities_df(contenu))
}

i <- 0  # count of blog files processed so far
for (file_blog in file_blogs) {
  # Log progress every 1000 files.
  if (i %% 1000 == 0) {
    print(paste0(i, ": Traitement de ", file_blog))
  }
  traiter_json(file_blog)
  i <- i + 1
}

```


```{r}
ii <- 0  # count of news files processed so far
for (file_article in file_news) {
  # Log progress every 1000 files.
  if (ii %% 1000 == 0) {
    print(paste0(ii, ": Traitement de ", file_article))
  }
  traiter_json(file_article)
  ii <- ii + 1
}
```


Loading…
Cancel
Save