Relations between tables

Published

January 27, 2025

Code

library(dm, warn.conflicts = FALSE)
library(htmltools)
library(DiagrammeRsvg)

source("R/utils.R")

EDA: Exploratory Data Analysis

Here we have room for improvement if we want to automate it.

I make a quick function:

Code

system("mkdir -p data/data_raw/zipped/")

tsv <- c("g_assignee_disambiguated.tsv",
            "g_inventor_disambiguated.tsv", 
            "g_location_disambiguated.tsv", 
            "g_patent.tsv")

purrr::walk(tsv, dl_me_raw_stuff)

this could be a pot. improvement: https://books.ropensci.org/targets/cloud-storage.html

Second step: get a sample of all of them.

I went with the first 5 rows.

# 1. create some repos
mkdir -p data/data_raw/unziped
mkdir -p data/data_sample/
# 2. I could find a way to unzip just the first rows
unzip data/data_raw/\*.zip -d data/data_raw/unziped
# 3. Get what we need
for file in data/data_raw/unziped/*.tsv ; do
        head -n5 "$file" >  "$file.head"
        echo "$file.head" 
done
# 4. Store it were we need it
mv data/data_raw/unziped/*.head data/data_sample
# 5. delete not needed and rename
rm -rf data/data_raw/unziped

Third: read everything in R.

Code

list_samples <- read_sample("data/data_sample/") 
names(list_samples)

 [1] "g_application.tsv.head"               
 [2] "g_assignee_disambiguated.tsv.head"    
 [3] "g_attorney_disambiguated.tsv.head"    
 [4] "g_botanic.tsv.head"                   
 [5] "g_cpc_at_issue.tsv.head"              
 [6] "g_cpc_current.tsv.head"               
 [7] "g_cpc_title.tsv.head"                 
 [8] "g_examiner_not_disambiguated.tsv.head"
 [9] "g_figures.tsv.head"                   
[10] "g_foreign_citation.tsv.head"          
[11] "g_foreign_priority.tsv.head"          
[12] "g_gov_interest_contracts.tsv.head"    
[13] "g_gov_interest_org.tsv.head"          
[14] "g_gov_interest.tsv.head"              
[15] "g_inventor_disambiguated.tsv.head"    
[16] "g_ipc_at_issue.tsv.head"              
[17] "g_location_disambiguated.tsv.head"    
[18] "g_other_reference.tsv.head"           
[19] "g_patent.tsv.head"                    
[20] "g_persistent_assignee.tsv.head"       
[21] "g_persistent_inventor.tsv.head"       
[22] "g_us_application_citation.tsv.head"   
[23] "g_us_patent_citation.tsv.head"        
[24] "g_uspc_at_issue.tsv.head"             
[25] "g_wipo_technology.tsv.head"

Forth: use DM to do a nice schema

Code

# simplify my list I went back and forth
i_data <- c("g_assignee_disambiguated.tsv.head",
            "g_inventor_disambiguated.tsv.head", 
            "g_location_disambiguated.tsv.head", 
            "g_patent.tsv.head",
            "g_cpc_current.tsv.head")

# yes dm can also just take a list of df
pot_schema_no_keys <- as_dm(list_samples[i_data])

Sadly I need to first test what are the primary and foreign key (PK and FK) and if I can trust the data about that: meaning reading a bunch of big csv and not only sampling to build the schema.

I am first trying to get:

county
year
nb of inventor
nb of assignee (organisation)
nb of patents → using assignee for localisation

dm offer high level functions that return tibble with result for all tables:

enum_pk_candidates(a_table)
and more targets function:

Code

(check_key(g_location_dis, location_id)) # a bit of strange behavior it return null on success?
# test if we have value in g_assignee_dis$location_id not in g_location_dis 
# tests if x is a subset of y
check_subset(g_assignee_dis, location_id, g_location_dis, location_id)
nrow(g_assignee_dis[g_assignee_dis$location_id == "",])
#  90823
nrow(g_assignee_dis)
# 8206092 
# small number does not have location 
check_subset(g_assignee_dis[!g_assignee_dis$location_id == "",], location_id, g_location_dis, location_id)
# return correct hence if we want use location as a fk for assignee we need to do that 
(check_key(g_location_dis, location_id)) 
## patent id 
check_subset(g_assignee_dis[!g_assignee_dis$location_id == "",], patent_id, g_patent, patent_id)
check_subset(g_inventor_dis, patent_id, g_patent, patent_id)
## cpc
dm::check_key(cpc, patent_id)

Slowly adding them:

Code

pot_schema_pk_keys <- 
    pot_schema_no_keys |>
        dm_add_pk(g_location_disambiguated.tsv.head, columns = location_id) |> 
        dm_add_pk(g_patent.tsv.head, columns = patent_id)

Adding Foreign keys:

Code

pot_schema_all_keys <- 
    pot_schema_pk_keys |>
     dm_add_fk(g_inventor_disambiguated.tsv.head, 
               location_id,
               g_location_disambiguated.tsv.head) |>
    dm_add_fk(g_assignee_disambiguated.tsv.head,
              location_id,
              g_location_disambiguated.tsv.head) |> 
    dm_add_fk(g_assignee_disambiguated.tsv.head,
              patent_id,
              g_patent.tsv.head) |>
    dm_add_fk(g_inventor_disambiguated.tsv.head,
              patent_id,
              g_patent.tsv.head) |>
    dm_add_fk(g_cpc_current.tsv.head, patent_id,
              g_patent.tsv.head)

Five: Try to render that schema:

Code

schema <- dm_draw(pot_schema_all_keys, view_type = 'all')

# kind of hacky 
schema_html <- DiagrammeRsvg::export_svg(schema)
htmltools::HTML(schema_html )

Producing a summary table

Sixth: given that logic build a summary table by county/year

We will probably do it in at least to n steps:

Getting counties, right now we do not know if we are missing some counties and we are still unsure what vintage of census patentsview is using.
Linking patent to assignee then link to location
Linking patent to inventor then link to location