Relations between tables

Published

September 5, 2024

Code
library(dm, warn.conflicts = FALSE)
library(htmltools)
library(DiagrammeRsvg)

source("R/utils.R")

EDA: Exploratory Data Analysis

Here we have room for improvement if we want to automate it.

I make a quick function:

Code
source("R/utils.R")

system("mkdir -p data/data_raw/unzipped")

tsv <- c("g_assignee_disambiguated.tsv",
            "g_inventor_disambiguated.tsv", 
            "g_location_disambiguated.tsv", 
            "g_patent.tsv")

purrr::walk(tsv, dl_me_raw_stuff)
  • this could be a pot. improvement: https://books.ropensci.org/targets/cloud-storage.html

Second step: get a sample of all of them.

I went with the first 5 rows.

# 1. create some repos
mkdir -p data/data_raw/unziped
mkdir -p data/data_sample/
# 2. I could find a way to unzip just the first rows
unzip data/data_raw/\*.zip -d data/data_raw/unziped
# 3. Get what we need
for file in data/data_raw/unziped/*.tsv ; do
        head -n5 "$file" >  "$file.head"
        echo "$file.head" 
done
# 4. Store it were we need it
mv data/data_raw/unziped/*.head data/data_sample
# 5. delete not needed and rename
rm -rf data/data_raw/unziped

Third: read everything in R.

Code
list_samples <- read_sample("data/data_sample/") 
names(list_samples)
 [1] "g_application.tsv.head"               
 [2] "g_assignee_disambiguated.tsv.head"    
 [3] "g_attorney_disambiguated.tsv.head"    
 [4] "g_botanic.tsv.head"                   
 [5] "g_cpc_at_issue.tsv.head"              
 [6] "g_cpc_current.tsv.head"               
 [7] "g_cpc_title.tsv.head"                 
 [8] "g_examiner_not_disambiguated.tsv.head"
 [9] "g_figures.tsv.head"                   
[10] "g_foreign_citation.tsv.head"          
[11] "g_foreign_priority.tsv.head"          
[12] "g_gov_interest_contracts.tsv.head"    
[13] "g_gov_interest_org.tsv.head"          
[14] "g_gov_interest.tsv.head"              
[15] "g_inventor_disambiguated.tsv.head"    
[16] "g_ipc_at_issue.tsv.head"              
[17] "g_location_disambiguated.tsv.head"    
[18] "g_other_reference.tsv.head"           
[19] "g_patent.tsv.head"                    
[20] "g_persistent_assignee.tsv.head"       
[21] "g_persistent_inventor.tsv.head"       
[22] "g_us_application_citation.tsv.head"   
[23] "g_us_patent_citation.tsv.head"        
[24] "g_uspc_at_issue.tsv.head"             
[25] "g_wipo_technology.tsv.head"           

Forth: use DM to do a nice schema

Code
# simplify my list I went back and forth
i_data <- c("g_assignee_disambiguated.tsv.head",
            "g_inventor_disambiguated.tsv.head", 
            "g_location_disambiguated.tsv.head", 
            "g_patent.tsv.head",
            "g_cpc_current.tsv.head")

# yes dm can also just take a list of df
pot_schema_no_keys <- as_dm(list_samples[i_data])

Sadly I need to first test what are the primary and foreign key (PK and FK) and if I can trust the data about that: meaning reading a bunch of big csv and not only sampling to build the schema.

I am first trying to get:

  • county

  • year

  • nb of inventor

  • nb of assignee (organisation)

  • nb of patents → using assignee for localisation

dm offer high level functions that return tibble with result for all tables:

  • enum_pk_candidates(a_table)
  • and more targets function:
Code
(check_key(g_location_dis, location_id)) # a bit of strange behavior it return null on success?
# test if we have value in g_assignee_dis$location_id not in g_location_dis 
# tests if x is a subset of y
check_subset(g_assignee_dis, location_id, g_location_dis, location_id)
nrow(g_assignee_dis[g_assignee_dis$location_id == "",])
#  90823
nrow(g_assignee_dis)
# 8206092 
# small number does not have location 
check_subset(g_assignee_dis[!g_assignee_dis$location_id == "",], location_id, g_location_dis, location_id)
# return correct hence if we want use location as a fk for assignee we need to do that 
(check_key(g_location_dis, location_id)) 
## patent id 
check_subset(g_assignee_dis[!g_assignee_dis$location_id == "",], patent_id, g_patent, patent_id)
check_subset(g_inventor_dis, patent_id, g_patent, patent_id)
## cpc
dm::check_key(cpc, patent_id)

Slowly adding them:

Code
pot_schema_pk_keys <- 
    pot_schema_no_keys |>
        dm_add_pk(g_location_disambiguated.tsv.head, columns = location_id) |> 
        dm_add_pk(g_patent.tsv.head, columns = patent_id)

Adding Foreign keys:

Code
pot_schema_all_keys <- 
    pot_schema_pk_keys |>
     dm_add_fk(g_inventor_disambiguated.tsv.head, 
               location_id,
               g_location_disambiguated.tsv.head) |>
    dm_add_fk(g_assignee_disambiguated.tsv.head,
              location_id,
              g_location_disambiguated.tsv.head) |> 
    dm_add_fk(g_assignee_disambiguated.tsv.head,
              patent_id,
              g_patent.tsv.head) |>
    dm_add_fk(g_inventor_disambiguated.tsv.head,
              patent_id,
              g_patent.tsv.head) |>
    dm_add_fk(g_cpc_current.tsv.head, patent_id,
              g_patent.tsv.head)

Five: Try to render that schema:

Code
schema <- dm_draw(pot_schema_all_keys, view_type = 'all')

# kind of hacky 
schema_html <- DiagrammeRsvg::export_svg(schema)
htmltools::HTML(schema_html )
%0 g_assignee_disambiguated.tsv.head g_assignee_disambiguated.tsv.head patent_id assignee_sequence assignee_id disambig_assignee_individual_name_first disambig_assignee_individual_name_last disambig_assignee_organization assignee_type location_id g_location_disambiguated.tsv.head g_location_disambiguated.tsv.head location_id disambig_city disambig_state disambig_country latitude longitude county state_fips county_fips g_assignee_disambiguated.tsv.head:location_id->g_location_disambiguated.tsv.head:location_id g_patent.tsv.head g_patent.tsv.head patent_id patent_type patent_date patent_title wipo_kind num_claims withdrawn filename g_assignee_disambiguated.tsv.head:patent_id->g_patent.tsv.head:patent_id g_cpc_current.tsv.head g_cpc_current.tsv.head patent_id cpc_sequence cpc_section cpc_class cpc_subclass cpc_group cpc_type g_cpc_current.tsv.head:patent_id->g_patent.tsv.head:patent_id g_inventor_disambiguated.tsv.head g_inventor_disambiguated.tsv.head patent_id inventor_sequence inventor_id disambig_inventor_name_first disambig_inventor_name_last gender_code location_id g_inventor_disambiguated.tsv.head:location_id->g_location_disambiguated.tsv.head:location_id g_inventor_disambiguated.tsv.head:patent_id->g_patent.tsv.head:patent_id

Producing a summary table

Sixth: given that logic build a summary table by county/year

We will probably do it in at least to n steps:

  • Getting counties, right now we do not know if we are missing some counties and we are still unsure what vintage of census patentsview is using.

  • Linking patent to assignee then link to location

  • Linking patent to inventor then link to location