From 9c147406aa04f39e03df61bf5073c51c686fcc80 Mon Sep 17 00:00:00 2001
From: gs3144
Date: Tue, 26 Jan 2021 18:50:27 -0500
Subject: [PATCH] Upload Assignment 1

---
Review notes (text between the "---" line above and the diff below is ignored
by git am and is not part of the commit message):

* Line breaks were restored from a copy of this patch in which they had been
  collapsed to spaces. Non-blank lines follow that text. Blank context lines
  and the indentation of SQL continuation lines cannot be recovered from it;
  they were placed so that every hunk matches the counts in its @@ header.
  NOTE(review): confirm against the pre-image (blob 99a7974) before applying.
* Added lines changed relative to the submitted patch (hunk sizes unchanged):
  - install.packages() is commented out again and given a single character
    vector; its second positional argument is `lib`, not another package.
  - db_password is read from the DB_PASSWORD environment variable instead of
    being stored in the file. The password that was committed here should be
    rotated.
  - "UPDATE PA SET is_banked = 'NULL'" now assigns SQL NULL, not the string.
  - "SELECT * FROM PAt" now reads from PA.
  - Constraints chunk: the final DROP TABLE of test9 now follows the two NULL
    inserts (they used to run against the dropped table) and the misspelled
    column name `professsor` is corrected.
  Because of these edits the post-image no longer hashes to 9e3385e.
* NOTE(review): SELECT * FROM PA UNION SELECT * FROM PI needs both tables to
  have the same number of columns - confirm for the toy data.
* NOTE(review): the text reviewed ends at "#EXERCISE 10"; the last @@ header
  expects one further context line - confirm the tail of the patch is present.

 sql-project.Rmd | 270 +++++++++++++++++++-----------------------------
 1 file changed, 105 insertions(+), 165 deletions(-)

diff --git a/sql-project.Rmd b/sql-project.Rmd
index 99a7974..9e3385e 100644
--- a/sql-project.Rmd
+++ b/sql-project.Rmd
@@ -1,6 +1,6 @@
 ---
 title: "sql-workshop"
-author: "Charles Lang"
+author: "Guotai Sun"
 output: html_document
 ---
 
@@ -8,15 +8,15 @@ Before you follow the directions below, please take a screenshot of your AWS con
 
 ## Connect to AWS MySQL Database
 ```{r}
-#install.packages("DBI", "RMySQL")
+#install.packages(c("DBI", "RMySQL"))
 
 library(DBI)
 library(RMySQL)
 
-db_user <- 'admin'
-db_password <- 'testsql!'
+db_user <- 'gs3144'
+db_password <- Sys.getenv("DB_PASSWORD")
 db_name <- 'oudb'
-db_host <- 'PASTE YOUR ENDPOINT HERE'
+db_host <- 'database-1.cl7ipplfercw.us-east-2.rds.amazonaws.com'
 db_port <- 3306
 
 mydb <- dbConnect(MySQL(), user = db_user, password = db_password, dbname = db_name, host = db_host, port = db_port)
@@ -26,31 +26,22 @@ summary(mydb)
 
 ## Load OU Data
 ```{r}
-#Student demographic data
-studentInfo <- read.csv("studentInfo.csv", header = TRUE)
-#Student assessment data
-studentAssessment <- read.csv("studentAssessment.csv", header = TRUE)
-#Course data
-courses <- read.csv("courses.csv", header = TRUE)
-studentRegistration <- read.csv("studentRegistration.csv", header = TRUE)
+PI <- read.csv("PI.csv", header = TRUE)
+
+PA <- read.csv("PA.csv", header = TRUE)
+
 ```
 
 ## Write data to the DB using the DBI package
 ```{r}
-#List the tables in the DB - should be zero
-dbListTables(mydb)
+dbWriteTable(mydb, "PI", PI)
 
-#Write a new table to the DB
-dbWriteTable(mydb, "studentInfo", studentInfo)
-dbWriteTable(mydb, "studentAssessment", studentAssessment)
-dbWriteTable(mydb, "courses", courses)
-dbWriteTable(mydb, "studentRegistration", studentRegistration)
-
-#List tables to see that table was added
-dbListTables(mydb)
+dbWriteTable(mydb, "PA", PA)
 
 #Read a particular table
-dbReadTable(mydb, 'studentInfo')
+dbReadTable(mydb, 'PI')
+
+dbReadTable(mydb, 'PA')
 
 #EXERCISE 1
 #Make two toy data sets with at least three variables and at least 30 rows each in them. Have a mix of numeric and character variables. Transfer these dataframes to your SQL database using the DBI commands. Name the tables whatever you like.
@@ -60,25 +51,21 @@ dbReadTable(mydb, 'studentInfo')
 ## Getting into SQL - READING
 ```{r}
 #Query a portion of the database (always returns dataframe)
-dbGetQuery(mydb, "SELECT * FROM studentInfo LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PI LIMIT 20;")
 
-dbGetQuery(mydb, "SELECT * FROM studentInfo ORDER BY id_student LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PI ORDER BY id_professor LIMIT 20;")
 
-dbGetQuery(mydb, "SELECT id_student, gender FROM studentInfo ORDER BY id_student DESC LIMIT 10;") #Order listed will be reflected in order in table
+dbGetQuery(mydb, "SELECT id_professor, gender FROM PI ORDER BY id_professor DESC LIMIT 10;")
 
-dbGetQuery(mydb, "SELECT id_student AS 'Student ID', gender FROM studentInfo LIMIT 10;") #SQL Standard says quotes for literal strings and double quotes for everything else but that conflicts with R
+dbGetQuery(mydb, "SELECT id_professor AS 'professor ID', gender FROM PI LIMIT 20;")
 
-#Count the number of rows
-dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment;")
+dbGetQuery(mydb, "SELECT COUNT(*) FROM PA;")
 
-#Using a WHERE statement on all columns
-dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment WHERE score > 50;")
+dbGetQuery(mydb, "SELECT COUNT(*) FROM PA WHERE date_submitted > 20;")
 
-#Using a WHERE statement on a single column (will not include missing data)
-dbGetQuery(mydb, "SELECT COUNT(score) FROM studentAssessment WHERE score > 50;")
+dbGetQuery(mydb, "SELECT COUNT(date_submitted) FROM PA WHERE date_submitted > 20;")
 
-#Using an AND statement
-dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment WHERE score > 50 AND id_assessment = '1752';")
+dbGetQuery(mydb, "SELECT COUNT(*) FROM PA WHERE date_submitted > 20 AND is_banked = '0';")
 
 #EXERCISE 2
 #Read one of your toy data tables, make sure the output is ordered in descending order, you rename one of the variables and the output is limited to the first 20 rows.
@@ -90,35 +77,35 @@ dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment WHERE score > 50 AND id
 ## Getting into SQL - UPDATING
 ```{r}
 #Count rows
-dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment;")
+dbGetQuery(mydb, "SELECT COUNT(*) FROM PA;")
 
 #Add a row
-dbGetQuery(mydb, "INSERT INTO studentAssessment (id_assessment, id_student, date_submitted, is_banked, score) VALUES ('00001', '1', '20', '0', '50');")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ('00001', '1', '0');")
 
 #Count rows again
-dbGetQuery(mydb, "SELECT COUNT(*) FROM studentAssessment;")
+dbGetQuery(mydb, "SELECT COUNT(*) FROM PA;")
 
 #View inserted row
-dbGetQuery(mydb, "SELECT * FROM studentAssessment ORDER BY id_student LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PA ORDER BY id_professor LIMIT 20;")
 
 #Add a row with missing values
-dbGetQuery(mydb, "INSERT INTO studentAssessment (id_assessment, id_student, date_submitted) VALUES ('00001', '1', '20');")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted) VALUES ('00001', '1');")
 
 #View inserted row
-dbGetQuery(mydb, "SELECT * FROM studentAssessment ORDER BY id_student LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PA ORDER BY id_professor LIMIT 20;")
 
 #Update a row
-dbGetQuery(mydb, "UPDATE studentAssessment SET score = '20' WHERE id_student = 1;")
+dbGetQuery(mydb, "UPDATE PA SET is_banked = '1' WHERE id_professor = 00001;")
 
-dbGetQuery(mydb, "SELECT id_student, score FROM studentAssessment ORDER BY id_student LIMIT 10;")
+dbGetQuery(mydb, "SELECT id_professor,is_banked FROM PA ORDER BY id_professor LIMIT 20;")
 
 #Update a row with NULL
-dbGetQuery(mydb, "UPDATE studentAssessment SET score = 'NULL' WHERE id_student = 6516;")
+dbGetQuery(mydb, "UPDATE PA SET is_banked = NULL WHERE id_professor = 65481;")
 
 #Delete a row (destructive)
-dbGetQuery(mydb, "DELETE FROM studentAssessment WHERE id_student = 1;")
+dbGetQuery(mydb, "DELETE FROM PA WHERE id_professor = 00001;")
 
-dbGetQuery(mydb, "SELECT * FROM studentAssessment ORDER BY id_student LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PA ORDER BY id_professor LIMIT 20;")
 
 #EXERCISE 3
 #Insert a new row in one of your toy data tables leaving one variable empty. Change one value in your other table. Display your new tables. Delete the row you edited and the row you inserted.
@@ -130,23 +117,21 @@ dbGetQuery(mydb, "SELECT * FROM studentAssessment ORDER BY id_student LIMIT 10;"
 ```{r}
 #Creating a new table in SQL
 dbGetQuery(mydb,"CREATE TABLE test (
-  score INTEGER,
-  student TEXT
+  id_professor INTEGER,
+  date_submitted INTEGER,
+  is_banked INTEGER
   );")
 
-dbListTables(mydb)
-
 #Inserting data into the table
-dbGetQuery(mydb, "INSERT INTO test VALUES ( 10, 'Amy' );")
-dbGetQuery(mydb, "INSERT INTO test VALUES ( 11, 'Jen' );")
-dbGetQuery(mydb, "INSERT INTO test VALUES ( 9, 'Frank' );")
+dbGetQuery(mydb, "INSERT INTO test VALUES ( 9998, 19,0 );")
+dbGetQuery(mydb, "INSERT INTO test VALUES ( 9999, 20,0 );")
+dbGetQuery(mydb, "INSERT INTO test VALUES ( 1000, 25,0 );")
 
 dbGetQuery(mydb, "SELECT * FROM test;")
 
 #Inserting a NULL row
-dbGetQuery(mydb, "INSERT INTO test DEFAULT VALUES;") #Will not work use instead:
 
-dbGetQuery(mydb,"INSERT INTO test (score, student) SELECT score, id_student FROM studentAssessment;")
+dbGetQuery(mydb,"INSERT INTO test (id_professor,date_submitted,is_banked) SELECT id_professor,date_submitted,is_banked FROM PA;")
 
 #Delete a table
 dbGetQuery(mydb, "DROP TABLE test;")
@@ -160,86 +145,61 @@ dbGetQuery(mydb, "DROP TABLE IF EXISTS test;") #No error since it is only if it
 #Create a table that is exactly the same as your first toy data table but this time use SQL commands. Display your new table. Then delete the original table.
 ```
 
-
-# NULL Value
-```{r}
-#NULL is a state (similar to R), represents the lack of a value. But is not compatible with R backend so this code doesn't work as part of dbGetQuery()
-
-#This doesn't work because NULL is not a value
-SELECT * FROM test WHERE score = NULL;
-
-#Instead use
-SELECT * FROM test WHERE score is NULL;
-
-```
-
 
 # Constraints
 ```{r}
-#Create table where student column *cannot* be NULL
-dbGetQuery(mydb,"CREATE TABLE test2 (
-  score INTEGER,
-  student TEXT NOT NULL
+
+dbGetQuery(mydb,"CREATE TABLE test9 (
+  date_submitted INTEGER,
+  professor TEXT NOT NULL
   );")
 
-dbGetQuery(mydb, "DROP TABLE IF EXISTS test2;")
+dbGetQuery(mydb, "DROP TABLE IF EXISTS test9;")
 
-dbGetQuery(mydb,"CREATE TABLE test2 (
-  score INTEGER DEFAULT 0,
-  student TEXT
+dbGetQuery(mydb,"CREATE TABLE test9 (
+  date_submitted INTEGER DEFAULT 0,
+  professor TEXT
   );")
 
-dbGetQuery(mydb,"INSERT INTO test2 (score, student) VALUES ('1', 'A');")
-dbGetQuery(mydb,"INSERT INTO test2 (student) VALUES ('B');")
+dbGetQuery(mydb,"INSERT INTO test9 (date_submitted, professor) VALUES ('19', 'A');")
+dbGetQuery(mydb,"INSERT INTO test9 (professor) VALUES ('B');")
 
-dbGetQuery(mydb, "SELECT * FROM test2;")
+dbGetQuery(mydb, "SELECT * FROM test9;")
 
-dbGetQuery(mydb, "DROP TABLE IF EXISTS test2;")
-
-dbGetQuery(mydb,"CREATE TABLE test2 (
-  score INTEGER UNIQUE,
-  student TEXT
-  );")
-
-dbGetQuery(mydb,"INSERT INTO test2 (score, student) VALUES ('1', 'A');")
-
-#Error because of unique
-dbGetQuery(mydb,"INSERT INTO test2 (score, student) VALUES ('1', 'A');")
 
 #NULL is exempt
-dbGetQuery(mydb,"INSERT INTO test2 (score, student) VALUES (NULL, 'A');")
-dbGetQuery(mydb,"INSERT INTO test2 (score, student) VALUES (NULL, 'A');")
+dbGetQuery(mydb,"INSERT INTO test9 (date_submitted, professor) VALUES (NULL, 'A');")
+dbGetQuery(mydb,"INSERT INTO test9 (date_submitted, professor) VALUES (NULL, 'A');")
+dbGetQuery(mydb, "DROP TABLE IF EXISTS test9;")
 
 #EXERCISE 5
 #Recreate one of your toy data tables with the constraint that for one of the integer variablesthe default value will be zero. Test your table by inserting some empty values. Display your new tables. Then delete your table.
 ```
-
-
 # Adding a column with a default value
 ```{r}
 #Add a column with default value 1
-dbGetQuery(mydb, "ALTER TABLE studentAssessment ADD email INTEGER DEFAULT 1 ")
+dbGetQuery(mydb, "ALTER TABLE PA ADD email INTEGER DEFAULT 3 ")
 
-dbGetQuery(mydb, "SELECT * FROM studentAssessment LIMIT 10;")
+dbGetQuery(mydb, "SELECT * FROM PA LIMIT 20;")
 
 #Delete a column
-dbGetQuery(mydb, "ALTER TABLE studentAssessment DROP COLUMN email;")
+dbGetQuery(mydb, "ALTER TABLE PA DROP COLUMN email;")
 
 #EXERCISE 6
 #Add a column to one of your toy data tables with a default value of 3. Display your new table. Delete this column.
 ```
-
-
 # ID Columns
 ```{r}
 dbGetQuery(mydb,"CREATE TABLE test3 (
-  id INTEGER AUTO_INCREMENT PRIMARY KEY, #Not standard syntax
-  score INTEGER,
-  student TEXT
+  id INTEGER AUTO_INCREMENT PRIMARY KEY,
+  age INTEGER,
+  score INTEGER,
+  is_banked INTEGER,
+  professor TEXT
   );")
 
-dbGetQuery(mydb,"INSERT INTO test3 (score, student) VALUES (1, 'A');")
-dbGetQuery(mydb,"INSERT INTO test3 (score, student) VALUES (5, 'B');")
+dbGetQuery(mydb,"INSERT INTO test3 (age,score, is_banked, professor) VALUES (25,90,0, 'A');")
+dbGetQuery(mydb,"INSERT INTO test3 (age,score, is_banked, professor) VALUES (30,95,0, 'B');")
 
 dbGetQuery(mydb, "SELECT * FROM test3;")
 
@@ -252,28 +212,13 @@ dbGetQuery(mydb, "DROP TABLE IF EXISTS test3;")
 
 ## Filtering (WHERE)
 ```{r}
-dbGetQuery(mydb, "SELECT id_student, date_submitted FROM studentAssessment WHERE date_submitted > 550 ORDER BY date_submitted DESC;")
-
-#OR Statement
-dbGetQuery(mydb, "SELECT id_student, date_submitted FROM studentAssessment WHERE date_submitted > 550 OR date_submitted < 2 ORDER BY date_submitted DESC;")
-
-#AND Statement
-dbGetQuery(mydb, "SELECT id_student, date_submitted FROM studentAssessment WHERE date_submitted > 550 AND id_student = 325750 ORDER BY date_submitted DESC;")
+dbGetQuery(mydb, "SELECT id_professor, date_submitted FROM PA WHERE date_submitted < 49 ORDER BY date_submitted DESC;")
 
 #LIKE
-dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE region LIKE '%Region%';")
-
-#Begin with 'Region'
-dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE region LIKE 'Region%';")
-
-#End with 'Region'
-dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE region LIKE '%Region';")
+dbGetQuery(mydb, "SELECT id_professor, gender, region FROM PI WHERE region LIKE 'South Region';")
 
 #'c' is the second letter
-dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE region LIKE '_c%';")
-
-#IN
-dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE region IN ('Wales','Ireland');")
+dbGetQuery(mydb, "SELECT id_professor, gender, region FROM PI WHERE region LIKE '_r%';")
 
 #EXERCISE 8
 #Query one of your original toy data tables, for two different conditions.
@@ -282,9 +227,9 @@ dbGetQuery(mydb, "SELECT id_student, gender, region FROM studentInfo WHERE regio
 
 ## Removing Duplicates
 ```{r}
-dbGetQuery(mydb, "SELECT DISTINCT region FROM studentInfo;")
+dbGetQuery(mydb, "SELECT DISTINCT region FROM PI;")
 
-dbGetQuery(mydb, "SELECT DISTINCT region, gender FROM studentInfo;")
+dbGetQuery(mydb, "SELECT DISTINCT region, gender FROM PI;")
 
 #EXERCISE 9
 #Insert a duplicate row into one of your toy data tables. Then query the table without including duplicates.
@@ -311,50 +256,45 @@ dbGetQuery(mydb,"SELECT
 
 #Relationships (JOIN) - *Slide*
 ```{r}
-#Create two tables with matches and join them
-
-dbGetQuery(mydb, "CREATE TABLE left_table (id INTEGER, description TEXT);")
-dbGetQuery(mydb, "CREATE TABLE right_table (id INTEGER, description TEXT);")
-
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 1, 'left 01');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 2, 'left 02');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 3, 'left 03');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 4, 'left 04');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 5, 'left 05');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 6, 'left 06');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 7, 'left 07');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 8, 'left 08');")
-dbGetQuery(mydb, "INSERT INTO left_table VALUES ( 9, 'left 09');")
-
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 6, 'left 06');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 7, 'left 07');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 8, 'left 08');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 9, 'left 09');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 10, 'left 10');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 11, 'left 11');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 12, 'left 12');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 13, 'left 13');")
-dbGetQuery(mydb, "INSERT INTO right_table VALUES ( 14, 'left 14');")
-
-dbGetQuery(mydb, "SELECT * FROM left_table;")
-dbGetQuery(mydb, "SELECT * FROM right_table;")
-
-dbGetQuery(mydb,"SELECT l.description AS left_table, r.description AS right_table
-  FROM left_table AS l
-  JOIN right_table AS r ON l.id = r.id")
-
-dbGetQuery(mydb,"SELECT l.description AS left_table, r.description AS right_table
-  FROM left_table AS l
-  RIGHT JOIN right_table AS r ON l.id = r.id")
-
-dbGetQuery(mydb,"SELECT l.description AS left_table, r.description AS right_table
-  FROM left_table AS l
-  LEFT JOIN right_table AS r ON l.id = r.id")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 1, 19,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 2, 22,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 3, 25,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 4, 23,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 5, 24,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 6, 18,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 7, 19,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 8, 26,0);")
+dbGetQuery(mydb, "INSERT INTO PA (id_professor,date_submitted,is_banked) VALUES ( 9, 24,0);")
+
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 6, 'M', 'NA');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 7, 'M', 'NA');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 8, 'M', 'NA');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 9, 'F', 'NA');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 10, 'F', 'CN');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 11, 'F', 'CN');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 12, 'M', 'CN');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 13, 'F', 'NA');")
+dbGetQuery(mydb, "INSERT INTO PI (id_professor,gender,region) VALUES ( 14, 'M', 'CN');")
+
+dbGetQuery(mydb, "SELECT * FROM PA;")
+dbGetQuery(mydb, "SELECT * FROM PI;")
+
+dbGetQuery(mydb,"SELECT PA.id_professor AS PA, PI.id_professor AS PI
+  FROM PA AS PA
+  JOIN PI AS PI ON PA.id_professor = PI.id_professor")
+
+dbGetQuery(mydb,"SELECT PA.id_professor AS PA, PI.id_professor AS PI
+  FROM PA AS PA
+  RIGHT JOIN PI AS PI ON PA.id_professor = PI.id_professor")
+
+dbGetQuery(mydb,"SELECT PA.id_professor AS PA, PI.id_professor AS PI
+  FROM PA AS PA
+  LEFT JOIN PI AS PI ON PA.id_professor = PI.id_professor")
 
 #Union
-dbGetQuery(mydb, "SELECT * FROM left_table
+dbGetQuery(mydb, "SELECT * FROM PA
   UNION
-  SELECT * FROM right_table;")
+  SELECT * FROM PI;")
 
 #EXERCISE 10