Difference between revisions of "Harvard Dataverse"

From edegan.com
Jump to navigation Jump to search
imported>Leo
 
(24 intermediate revisions by 3 users not shown)
Line 1: Line 1:
This page records how to load/use the Harvard Dataverse.
+
 
 +
Return to [[Patent Data (Wiki Page)]].
 +
<section begin=dataverse />
 +
The Harvard Dataverse provides clean versions of the U.S. utility patent datasets spanning 1975-2010. The data is post author disambiguation. <section end=dataverse /> For details, see the paper at [https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/15705 Harvard Dataverse].
 +
 
 +
This page records how to load and use the Harvard Dataverse. The patents from 1975-2010 loaded as .sqlite3 and csv files can be found at
 +
[https://dataverse.harvard.edu/dataset.xhtml?persistentId=hdl:1902.1/15705 Harvard Dataverse]. All of the files have been downloaded to the database serverr and can be found at cd/bulk/patent.  
  
 
==Getting the data==
 
==Getting the data==
Line 7: Line 13:
  
 
For more information about the patent data:
 
For more information about the patent data:
[[Patent Data]]
+
[[Patent Data (Wiki Page)]]
 +
 
 +
To recreate the tables:
 +
 
 +
1. run createtables.sql
 +
 
 +
2. run copytables.sql
 +
 
 +
3. run cleaning db.sql
 +
 
 +
These scripts are available under /bulk/Software/Database\ Scripts
  
 
==Loading the tables into the database==
 
==Loading the tables into the database==
Line 198: Line 214:
 
     CitSeq integer
 
     CitSeq integer
 
  );
 
  );
 
I have not processed the following tables:
 
  
 
  CREATE TABLE usreldocs (
 
  CREATE TABLE usreldocs (
     Patent varchar,
+
     Patent integer,
 
     DocType varchar,
 
     DocType varchar,
 
     OrderSeq integer,
 
     OrderSeq integer,
Line 208: Line 222:
 
     RelPatent varchar,
 
     RelPatent varchar,
 
     Kind varchar,
 
     Kind varchar,
     RelDate varchar,
+
     RelDate date,
 
     Status varchar
 
     Status varchar
 
  );
 
  );
  
  CREATE TABLE invpat (
+
  CREATE TABLE invpats (
 
     Firstname varchar,
 
     Firstname varchar,
 
     Lastname varchar,
 
     Lastname varchar,
Line 223: Line 237:
 
     Lon double precision,
 
     Lon double precision,
 
     InvSeq integer,
 
     InvSeq integer,
     Patent varchar,
+
     Patent integer,
 
     GYear integer,
 
     GYear integer,
     AppYear varchar,
+
     AppYear integer,
     AppDate varchar,
+
     AppDate date,
 
     Assignee varchar,
 
     Assignee varchar,
 
     AsgNum varchar,
 
     AsgNum varchar,
Line 234: Line 248:
 
     Up varchar
 
     Up varchar
 
  );
 
  );
 +
 +
==Some statistics for the Patent table==
 +
SELECT gyear, COUNT(*) FROM patents GROUP BY gyear ORDER BY gyear;
 +
gyear | count 
 +
-------+--------
 +
  1975 |  72000
 +
  1976 |  70226
 +
  1977 |  65269
 +
  1978 |  66102
 +
  1979 |  48854
 +
  1980 |  61819
 +
  1981 |  65771
 +
  1982 |  57888
 +
  1983 |  56860
 +
  1984 |  67200
 +
  1985 |  71661
 +
  1986 |  70860
 +
  1987 |  82952
 +
  1988 |  77924
 +
  1989 |  95537
 +
  1990 |  90364
 +
  1991 |  96513
 +
  1992 |  97444
 +
  1993 |  98342
 +
  1994 | 101676
 +
  1995 | 101419
 +
  1996 | 109645
 +
  1997 | 111984
 +
  1998 | 147574
 +
  1999 | 153593
 +
  2000 | 157596
 +
  2001 | 166065
 +
  2002 | 167424
 +
  2003 | 169105
 +
  2004 | 164413
 +
  2005 | 143927
 +
  2006 | 173922
 +
  2007 | 157502
 +
  2008 | 157894
 +
  2009 | 167537
 +
  2010 | 219909
 +
(36 rows)
 +
 +
SELECT appyear, COUNT(*) FROM patents GROUP BY appyear ORDER BY appyear;
 +
appyear | count 
 +
---------+--------
 +
    1901 |      3
 +
    1902 |      1
 +
    1903 |      4
 +
    1904 |      4
 +
    1905 |      1
 +
    1908 |      1
 +
    1909 |      1
 +
    1915 |      1
 +
    1918 |      1
 +
    1921 |      1
 +
    1922 |      1
 +
    1925 |      1
 +
    1929 |      1
 +
    1930 |      2
 +
    1931 |      2
 +
    1933 |      1
 +
    1936 |      2
 +
    1938 |      4
 +
    1939 |      1
 +
    1940 |      3
 +
    1941 |      3
 +
    1942 |      1
 +
    1943 |      4
 +
    1944 |    19
 +
    1945 |    18
 +
    1946 |    11
 +
    1947 |      4
 +
    1948 |      8
 +
    1949 |    10
 +
    1950 |      9
 +
    1951 |    11
 +
    1952 |    17
 +
    1953 |    12
 +
    1954 |    30
 +
    1955 |    33
 +
    1956 |    33
 +
    1957 |    32
 +
    1958 |    39
 +
    1959 |    56
 +
    1960 |    73
 +
    1961 |    85
 +
    1962 |    111
 +
    1963 |    110
 +
    1964 |    147
 +
    1965 |    167
 +
    1966 |    193
 +
    1967 |    230
 +
    1968 |    337
 +
    1969 |    584
 +
    1970 |  1161
 +
    1971 |  2658
 +
    1972 |  9418
 +
    1973 |  41655
 +
    1974 |  65683
 +
    1975 |  65902
 +
    1976 |  65813
 +
    1977 |  65999
 +
    1978 |  65613
 +
    1979 |  65729
 +
    1980 |  66500
 +
    1981 |  63930
 +
    1982 |  65029
 +
    1983 |  61578
 +
    1984 |  67085
 +
    1985 |  71463
 +
    1986 |  75108
 +
    1987 |  81499
 +
    1988 |  90178
 +
    1989 |  96139
 +
    1990 |  99372
 +
    1991 | 100262
 +
    1992 | 103926
 +
    1993 | 108346
 +
    1994 | 123358
 +
    1995 | 144655
 +
    1996 | 144864
 +
    1997 | 169578
 +
    1998 | 168471
 +
    1999 | 180655
 +
    2000 | 196639
 +
    2001 | 208908
 +
    2002 | 206229
 +
    2003 | 191514
 +
    2004 | 181840
 +
    2005 | 166524
 +
    2006 | 139318
 +
    2007 |  97911
 +
    2008 |  48539
 +
    2009 |  12245
 +
    2010 |  1052
 +
        |      2
 +
(91 rows)
 +
 +
 +
patentdata=# SELECT COUNT(*) FROM patents;
 +
  count 
 +
---------
 +
3984771
 +
(1 row)
 +
 +
patentdata=# SELECT kind, COUNT(*) FROM patents GROUP BY kind ORDER BY kind;
 +
kind |  count 
 +
------+---------
 +
      | 2302931
 +
A1  |      1
 +
B1  |  561885
 +
B2  | 1119954
 +
(4 rows)
 +
 +
SELECT p.patent, COUNT(c.patent) AS numcited FROM patents AS p, CITATIONS AS c where p.patent=c.citation GROUP BY p.patent ORDER BY numcited DESC;
 +
patent  | numcited
 +
---------+--------------
 +
4683202 |        1992
 +
4723129 |        1935
 +
4683195 |        1814
 +
4463359 |        1676
 +
4740796 |        1638
 +
4558333 |        1558
 +
4345262 |        1537
 +
4313124 |        1504
 +
4459600 |        1461
 +
4733665 |        1286
 +
5103459 |        1118
 +
5572643 |        1018
 +
4901307 |          959
 +
5143854 |          924
 +
5523520 |          918
 +
5643826 |          883
 +
4655771 |          861
 +
4340563 |          808
 +
5742905 |          792
 +
5892900 |          780
 +
4799156 |          771
 +
4816567 |          751
 +
5172338 |          749
 +
4886062 |          745
 +
4776337 |          744
 +
4965188 |          742
 +
4800882 |          726
 +
4580568 |          706
 +
5710887 |          697
 +
4665906 |          697
 +
3953566 |          681
 +
5056109 |          678
 +
4405829 |          666
 +
4739762 |          665
 +
4503569 |          662
 +
4608577 |          659
 +
4179337 |          642
 +
5794207 |          636
 +
5064435 |          635
 +
5530852 |          630
 +
5272236 |          629
 +
4100324 |          625
 +
5708780 |          623
 +
5608786 |          620
 +
5109390 |          618
 +
5715314 |          616
 +
5923962 |          610
 +
5101501 |          608
 +
5774660 |          607
 +
4908112 |          607
 +
 +
[[Category:Internal]]
 +
[[Internal Classification::Legacy| ]]

Latest revision as of 16:26, 21 March 2017

Return to Patent Data (Wiki Page).

The Harvard Dataverse provides clean versions of the U.S. utility patent datasets spanning 1975-2010. The data is post author disambiguation.

For details, see the paper at Harvard Dataverse.

This page records how to load and use the Harvard Dataverse. The patents from 1975-2010 loaded as .sqlite3 and csv files can be found at Harvard Dataverse. All of the files have been downloaded to the database serverr and can be found at cd/bulk/patent.

Getting the data

Where we get the Harvard Dataverse data: Harvard Dataverse

For more information about the patent data: Patent Data (Wiki Page)

To recreate the tables:

1. run createtables.sql

2. run copytables.sql

3. run cleaning db.sql

These scripts are available under /bulk/Software/Database\ Scripts

Loading the tables into the database

COPY classes FROM '/tmp/class.csv' DELIMITER ',' NULL  CSV;
COPY assignees FROM '/tmp/assignee.csv' DELIMITER ',' NULL  CSV;
COPY citations FROM '/tmp/citation.csv' DELIMITER ',' NULL  CSV;
COPY inventors FROM '/tmp/inventor.csv' DELIMITER ',' NULL  CSV;
COPY patents FROM '/tmp/patent.csv' DELIMITER ',' NULL  CSV;
COPY patdescs FROM '/tmp/patdesc.csv' DELIMITER ',' NULL  CSV;
COPY lawyers FROM '/tmp/lawyer.csv' DELIMITER ',' NULL  CSV;
COPY scirefs FROM '/tmp/sciref.csv' DELIMITER ',' NULL  CSV;
COPY usreldocs FROM '/tmp/usreldoc.csv' DELIMITER ',' NULL  CSV;
COPY invpats FROM '/tmp/invpat.csv' DELIMITER ',' NULL  CSV;

Accessing the dbase

On the database server, the entire Harvard Dataverse for Patent Data is downloaded in directory

 /bulk/patent

The files have been processed to be used for the create table scripts I have created to load the Harvard Dataverse database into our own psql database.

These preprocessed files are available under

/bulk/patent/preprocessed

The database I have created for the Patent Data from 1975-2010 is called patent data. After logging in to researcher@128.42.44.181, you can get access to the database by

psql patentdata

You can see a list of the tables inside the patent database by

patentdata=# \dt
           List of relations
Schema |   Name    | Type  |   Owner    
--------+-----------+-------+------------
public | assignees | table | researcher
public | citations | table | researcher
public | classes   | table | researcher
public | inventors | table | researcher
public | invpat    | table | researcher
public | lawyers   | table | researcher
public | patdesc   | table | researcher
public | patents   | table | researcher
public | scirefs   | table | researcher
public | usreldocs | table | researcher
(10 rows)

You can select from the database, for example, by

patentdata=# SELECT * FROM CLASSES WHERE patent='03930270';
 patent  | prim | class | subclass 
----------+------+-------+----------
03930270 |    1 | 360   | 130.24
03930270 |    0 | 360   | 84
03930270 |    0 | 360   | 271.8
03930270 |    0 | G9B   | 15.08
(4 rows)

Or, do a join:

patentdata=# SELECT * FROM inventors AS i, classes AS c WHERE i.Patent=c.Patent AND i.Patent='03930270';
 patent  | firstname | lastname | street |   city    | state | country | zipcode | nationality | invseq |  patent  | prim | class | subclass 
----------+-----------+----------+--------+-----------+-------+---------+---------+-------------+--------+----------+------+-------+----------
03930270 | YOSHINORI | YAMAZOE  |        | OSAKA     |       | JP      |         |             |      3 | 03930270 |    1 | 360   | 130.24
03930270 | YOSHINORI | YAMAZOE  |        | OSAKA     |       | JP      |         |             |      3 | 03930270 |    0 | 360   | 84
03930270 | YOSHINORI | YAMAZOE  |        | OSAKA     |       | JP      |         |             |      3 | 03930270 |    0 | 360   | 271.8
03930270 | YOSHINORI | YAMAZOE  |        | OSAKA     |       | JP      |         |             |      3 | 03930270 |    0 | G9B   | 15.08
03930270 | CHIHIRO   | NAKAMURA |        | MORIGUCHI |       | JP      |         |             |      4 | 03930270 |    1 | 360   | 130.24
03930270 | CHIHIRO   | NAKAMURA |        | MORIGUCHI |       | JP      |         |             |      4 | 03930270 |    0 | 360   | 84
03930270 | CHIHIRO   | NAKAMURA |        | MORIGUCHI |       | JP      |         |             |      4 | 03930270 |    0 | 360   | 271.8
03930270 | CHIHIRO   | NAKAMURA |        | MORIGUCHI |       | JP      |         |             |      4 | 03930270 |    0 | G9B   | 15.08
03930270 | SATOSHI   | KIKUYA   |        | KADOMA    |       | JP      |         |             |      2 | 03930270 |    1 | 360   | 130.24
03930270 | SATOSHI   | KIKUYA   |        | KADOMA    |       | JP      |         |             |      2 | 03930270 |    0 | 360   | 84
03930270 | SATOSHI   | KIKUYA   |        | KADOMA    |       | JP      |         |             |      2 | 03930270 |    0 | 360   | 271.8
03930270 | SATOSHI   | KIKUYA   |        | KADOMA    |       | JP      |         |             |      2 | 03930270 |    0 | G9B   | 15.08
03930270 | SHIZUHIKO | TANIGAWA |        | MORIGUCHI |       | JP      |         |             |      1 | 03930270 |    1 | 360   | 130.24
03930270 | SHIZUHIKO | TANIGAWA |        | MORIGUCHI |       | JP      |         |             |      1 | 03930270 |    0 | 360   | 84
03930270 | SHIZUHIKO | TANIGAWA |        | MORIGUCHI |       | JP      |         |             |      1 | 03930270 |    0 | 360   | 271.8
03930270 | SHIZUHIKO | TANIGAWA |        | MORIGUCHI |       | JP      |         |             |      1 | 03930270 |    0 | G9B   | 15.08
(16 rows)

The schema I used for this database is documented at the bottom of this page.

And this is also saved as a .sql script under

/bulk/Software/Database\ Scripts/createtables.sql

A corresponding Drop table scripts is also available at

/bulk/Software/Database\ Scripts/droptables.sql

Converting DataTypes

Method 1: Explicit recast

CAST function

Method 2: implicit recast

SELECT varname::type

Method 3: through a language

CREATE OR REPLACE FUNCTION chartoint (text) RETURNS int AS $$
    if ($_[0]) {
       if ($_[0] =~ m/^\d+$/) {
    	  return $_[0];
       } else {
         return -1;
       }
    }
    return undef;
$$ LANGUAGE plperl;


The scripts for cleaning up the tables are available under /Database\ Scripts/cleaning\ db.sql

Creating the tables

The following is the schema for the 10 tables in this database:

I have cleaned up and fixed the types for the the following tables:

CREATE TABLE patents (
    Patent integer,
    Kind varchar,   #only types NULL, A1, B1, B2
    Claims integer, 
    AppType integer, #int (http://www.uspto.gov/web/offices/ac/ido/oeip/taf/filingyr.htm) 29 is design
    AppNum integer,
    GDate date,      
    GYear integer,     
    AppDate date,
    AppYear integer
);
CREATE TABLE classes (
    Patent integer
    Prim integer,
    Class varchar,
    Subclass varchar
);
CREATE TABLE assignees (
    Patent integer,
    AsgType integer,
    Assignee varchar,
    City varchar,
    State varchar,
    Country varchar,
    Nationality varchar,
    Residence varchar,
    AsgSeq integer
);
CREATE TABLE citations (
    Patent integer,
    Cit_Date date,
    Cit_Name varchar,
    Cit_Kind varchar,
    Cit_Country varchar,
    Citation integer,
    Category varchar,
    CitSeq integer
);
CREATE TABLE inventors (
    Patent integer,
    Firstname varchar,
    Lastname varchar,
    Street varchar,
    City varchar,
    State varchar,
    Country varchar,
    Zipcode varchar, #??
    Nationality varchar,
    InvSeq integer
);
CREATE TABLE patdesc (
    Patent integer,
    Abstract varchar,
    Title varchar
);
CREATE TABLE lawyers (
    Patent integer,
    Firstname varchar,
    Lastname varchar,
    LawCountry varchar,
    OrgName varchar,
    LawSeq integer
);
CREATE TABLE scirefs (
    Patent integer,
    Descrip varchar,
    CitSeq integer
);
CREATE TABLE usreldocs (
    Patent integer,
    DocType varchar,
    OrderSeq integer,
    Country varchar,
    RelPatent varchar,
    Kind varchar,
    RelDate date,
    Status varchar
);
CREATE TABLE invpats (
    Firstname varchar,
    Lastname varchar,
    Street varchar,
    City varchar,
    State varchar,
    Country varchar,
    Zipcode integer,
    Lat double precision,
    Lon double precision,
    InvSeq integer,
    Patent integer,
    GYear integer,
    AppYear integer,
    AppDate date,
    Assignee varchar,
    AsgNum varchar,
    Class varchar,
    InvNum varchar,
    Low varchar,
    Up varchar
);

Some statistics for the Patent table

SELECT gyear, COUNT(*) FROM patents GROUP BY gyear ORDER BY gyear;
gyear | count  
-------+--------
 1975 |  72000
 1976 |  70226
 1977 |  65269
 1978 |  66102
 1979 |  48854
 1980 |  61819
 1981 |  65771
 1982 |  57888
 1983 |  56860
 1984 |  67200
 1985 |  71661
 1986 |  70860
 1987 |  82952
 1988 |  77924
 1989 |  95537
 1990 |  90364
 1991 |  96513
 1992 |  97444
 1993 |  98342
 1994 | 101676
 1995 | 101419
 1996 | 109645
 1997 | 111984
 1998 | 147574
 1999 | 153593
 2000 | 157596
 2001 | 166065
 2002 | 167424
 2003 | 169105
 2004 | 164413
 2005 | 143927
 2006 | 173922
 2007 | 157502
 2008 | 157894
 2009 | 167537
 2010 | 219909
(36 rows)
SELECT appyear, COUNT(*) FROM patents GROUP BY appyear ORDER BY appyear;
appyear | count  
---------+--------
   1901 |      3
   1902 |      1
   1903 |      4
   1904 |      4
   1905 |      1
   1908 |      1
   1909 |      1
   1915 |      1
   1918 |      1
   1921 |      1
   1922 |      1
   1925 |      1
   1929 |      1
   1930 |      2
   1931 |      2
   1933 |      1
   1936 |      2
   1938 |      4
   1939 |      1
   1940 |      3
   1941 |      3
   1942 |      1
   1943 |      4
   1944 |     19
   1945 |     18
   1946 |     11
   1947 |      4
   1948 |      8
   1949 |     10
   1950 |      9
   1951 |     11
   1952 |     17
   1953 |     12
   1954 |     30
   1955 |     33
   1956 |     33
   1957 |     32
   1958 |     39
   1959 |     56
   1960 |     73
   1961 |     85
   1962 |    111
   1963 |    110
   1964 |    147
   1965 |    167
   1966 |    193
   1967 |    230
   1968 |    337
   1969 |    584
   1970 |   1161
   1971 |   2658
   1972 |   9418
   1973 |  41655
   1974 |  65683
   1975 |  65902
   1976 |  65813
   1977 |  65999
   1978 |  65613
   1979 |  65729
   1980 |  66500
   1981 |  63930
   1982 |  65029
   1983 |  61578
   1984 |  67085
   1985 |  71463
   1986 |  75108
   1987 |  81499
   1988 |  90178
   1989 |  96139
   1990 |  99372
   1991 | 100262
   1992 | 103926
   1993 | 108346
   1994 | 123358
   1995 | 144655
   1996 | 144864
   1997 | 169578
   1998 | 168471
   1999 | 180655
   2000 | 196639
   2001 | 208908
   2002 | 206229
   2003 | 191514
   2004 | 181840
   2005 | 166524
   2006 | 139318
   2007 |  97911
   2008 |  48539
   2009 |  12245
   2010 |   1052
        |      2
(91 rows)


patentdata=# SELECT COUNT(*) FROM patents;
 count  
---------
3984771
(1 row)
patentdata=# SELECT kind, COUNT(*) FROM patents GROUP BY kind ORDER BY kind;
kind |  count  
------+---------
     | 2302931
A1   |       1
B1   |  561885
B2   | 1119954
(4 rows)
SELECT p.patent, COUNT(c.patent) AS numcited FROM patents AS p, CITATIONS AS c where p.patent=c.citation GROUP BY p.patent ORDER BY numcited DESC;
patent  | numcited 
---------+--------------
4683202 |         1992
4723129 |         1935
4683195 |         1814
4463359 |         1676
4740796 |         1638
4558333 |         1558
4345262 |         1537
4313124 |         1504
4459600 |         1461
4733665 |         1286
5103459 |         1118
5572643 |         1018
4901307 |          959
5143854 |          924
5523520 |          918
5643826 |          883
4655771 |          861
4340563 |          808
5742905 |          792
5892900 |          780
4799156 |          771
4816567 |          751
5172338 |          749
4886062 |          745
4776337 |          744
4965188 |          742
4800882 |          726
4580568 |          706
5710887 |          697
4665906 |          697
3953566 |          681
5056109 |          678
4405829 |          666
4739762 |          665
4503569 |          662
4608577 |          659
4179337 |          642
5794207 |          636
5064435 |          635
5530852 |          630
5272236 |          629
4100324 |          625
5708780 |          623
5608786 |          620
5109390 |          618
5715314 |          616
5923962 |          610
5101501 |          608
5774660 |          607
4908112 |          607