24  Raspagem Google Patents

Baixar lista de patentes a serem raspadas no Google Patents:

TI=(biogas) OR AB=(biogas) OR CL=(biogas) country:BR

e salvá-la como: gp-search-20230830-124212.csv

Google Patents Download

24.1 Função de raspagem: sniff_google_patents

code/raspagem-google-patents/utils.R
#' Scrape a single Google Patents page.
#'
#' Downloads the page at `url` once and extracts the abstract, the claims,
#' the two citation tables and the IPC-like classification codes from it.
#'
#' @param url Character scalar with the address of a Google Patents page.
#' @param show_progress If `TRUE`, print a timestamp and the URL before
#'   scraping. Defaults to `FALSE`.
#' @return A named list with the elements `url`, `abstract`, `claims`,
#'   `cited_by`, `patent_citations` and `ipc`. An element is `NA` when the
#'   corresponding node is not found in the page.
sniff_google_patents <- function(url, show_progress = FALSE) {

  if (isTRUE(show_progress)) {
    print(paste(format(Sys.time(), '%Y-%m-%d %H:%M:%S'), url, sep = ' '))
  }

  ## Firefox: inspect the page with JavaScript disabled
  # - Enter about:config into the search bar and select Accept the Risk and Continue.
  # - Enter javascript.enabled into the search box at the top of the page.
  # - Select the javascript.enabled toggle to change the value to false.

  # TRUE only for a real node, never for the "missing" placeholder that
  # rvest::html_element() returns when nothing matches.
  node_found <- function(node) {
    inherits(node, 'xml_node') && !inherits(node, 'xml_missing')
  }

  # Text of the first node matching `xpath`, or NA when there is none.
  extract_text <- function(page, xpath) {
    node <- rvest::html_element(page, xpath = xpath)
    if (!node_found(node)) {
      return(NA)
    }
    rvest::html_text2(node)
  }

  # First table that follows an element whose text contains `heading`.
  # Column names come from <thead>, rows from <tbody>; NA when absent.
  extract_citation_table <- function(page, heading) {
    table_xpath <- paste0(
      '//*[contains (text(), "', heading, '")]//following::table'
    )

    head_node <- rvest::html_element(page, xpath = paste0(table_xpath, '/thead'))
    if (!node_found(head_node)) {
      return(NA)
    }

    column_names <- head_node |>
      rvest::html_table() |>
      janitor::clean_names() |>
      names()

    page |>
      rvest::html_element(xpath = paste0(table_xpath, '/tbody')) |>
      rvest::html_table() |>
      stats::setNames(column_names) |>
      # keep only the first line of the cell (the publication number itself)
      dplyr::mutate(publication_number = gsub('\n.*$', '', publication_number))
  }

  # Fetch and parse the page a single time; every extraction reuses it.
  page <- rvest::read_html(url)

  # -----
  # abstract
  abstract <- extract_text(page, '//abstract')

  # -----
  # claims
  claims <- extract_text(page, '//claims')

  # -----
  # Cited By: without javascript
  # Families Citing this family: with javascript
  cited_by <- extract_citation_table(page, 'Families Citing this family')

  # -----
  # Family Cites Families: without javascript
  # Patent Citations: with javascript
  patent_citations <- extract_citation_table(page, 'Family Cites Families')

  # -----
  # Classifications: without javascript
  ipc_text <- page |>
    rvest::html_element(
      xpath = '//*[contains (text(), "Classifications")]//following::*'
    ) |>
    as.character()

  if (is.null(ipc_text) || length(ipc_text) == 0 || all(is.na(ipc_text))) {
    ipc <- NA
  } else {
    # NOTE(review): the pattern requires exactly two digits on each side of
    # the slash, so a code such as "C02F3/28" would not match - confirm
    # whether that is intended.
    ipc <- stringr::str_extract_all(
      ipc_text,
      '[A-Z]{1}[0-9]{2}[A-Z]{1}[0-9]{2}\\/[0-9]{2}'
    )[[1]]
  }

  # -----
  list(url = url,
       abstract = abstract,
       claims = claims,
       cited_by = cited_by,
       patent_citations = patent_citations,
       ipc = ipc)
}

24.2 Raspagem

Código
library(tidyverse) 
library(stringr) 
library(purrr) 
library(furrr)
library(tictoc) 

# Load the scraping function sniff_google_patents().
source('code/raspagem-google-patents/utils.R')

# Import the list of patents to scrape. The first line of the file is
# skipped, column names are cleaned, and the id column has punctuation
# removed and surrounding whitespace trimmed.
pat <-
  readr::read_csv(
    'code/raspagem-google-patents/gp-search-20230830-124212.csv',
    skip = 1
  ) |>
  tibble::tibble() |>
  janitor::clean_names() |>
  dplyr::mutate(id = stringr::str_trim(gsub('[[:punct:]]', '', id)))

dplyr::glimpse(pat)
Rows: 511
Columns: 10
$ id                         <chr> "BR112018012788B1", "BR112012028650B1", "BR…
$ title                      <chr> "METHOD AND FACILITY TO PRODUCE BIOMETHANE …
$ assignee                   <chr> "Waga Energy", "Solvay Sa", "Herbst Umweltt…
$ inventor_author            <chr> "Guénaël PRINCE, Mathieul Lefebvre, Pierre …
$ priority_date              <date> 2015-12-24, 2010-05-10, 2017-01-30, 2005-0…
$ filing_creation_date       <date> 2016-11-10, 2011-05-10, 2018-01-29, 2006-0…
$ publication_date           <date> 2022-12-13, 2019-04-30, 2020-03-17, 2011-0…
$ grant_date                 <lgl> NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,…
$ result_link                <chr> "https://patents.google.com/patent/BR112018…
$ representative_figure_link <chr> "https://patentimages.storage.googleapis.co…
Código
# Run sniff_google_patents() on every result link, one at a time,
# timing the whole pass.
tictoc::tic()
gp_raspado <- purrr::map(
  pat$result_link,
  sniff_google_patents,
  .progress = TRUE
)
tictoc::toc() # 54 minutes

# Label each scraped result with its patent id.
gp_raspado <- stats::setNames(gp_raspado, pat$id)

54 minutos utilizando 1 núcleo.

24.3 Raspagem multiprocessada

Multiprocessando utilizando 28 núcleos.

Código
# Configure multiprocessing: run the scraping in 28 background R sessions.
library(furrr)
plan(multisession, workers = 28)

# Run sniff_google_patents() on every result link in parallel,
# timing the whole pass.
tictoc::tic()
gp_raspado2 <- furrr::future_map(
  pat$result_link,
  sniff_google_patents,
  .progress = TRUE
)
tictoc::toc() # 58 seconds

# Label each scraped result with its patent id. The names go on
# gp_raspado2, the object created above.
names(gp_raspado2) <- pat$id

58 segundos utilizando 28 núcleos.