An SQLite database was set up which has all the prices for each chain store that published to the web on 09/07/2015. This database has been archived (roughly 0.5gb archived and 3gb decompressed) and placed on the r-israel website for open access. The database can be accessed by many packages dplyr, RSQlite, DBI.
library(dplyr,quietly = T,warn.conflicts = F)
library(RSQLite,quietly = T,warn.conflicts = F)
chain_db <- src_sqlite(path = "D:/Prices/Single Day/chain_db.sqlite3", create = F)
#List tables in database
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl" "chain_list" "coop_prices_tbl"
## [4] "mega_prices_tbl" "nibit_prices_tbl" "rami_levi_addresses"
## [7] "shufersal_prices_tbl" "sqlite_stat1" "stores_all"
#The tables are split into data provider groups because of the uniform xml formats within each group, cerebus and nibit give data for 7 and 3 chains respectively.
nibit=tbl(chain_db, "nibit_prices_tbl") #single day prices for all 3 chains in nibit
nibit%>%glimpse
## Observations: 815670
## Variables:
## $ X1 (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ChainID (chr) "7290661400001", "7290661400001", "...
## $ SubChainID (chr) "001", "001", "001", "001", "001", ...
## $ StoreID (chr) "071", "071", "071", "071", "071", ...
## $ BikoretNo (chr) "000", "000", "000", "000", "000", ...
## $ PriceUpdateDate (chr) "2013/12/15 09:39", "2012/11/27 22:...
## $ ItemCode (chr) "1", "19", "26", "39", "61", "64", ...
## $ ItemType (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ ItemName (chr) "משלוח", "תרומות למשפחות נזקקות", "...
## $ ManufactureName (chr) "משלוחים/תווים", "שונות /שרות", "שו...
## $ ManufactureCountry (chr) "", "", "", "", "", "", "", "", "",...
## $ ManufactureItemDescription (chr) "", "", "", "", "", "", "", "", "",...
## $ UnitQty (chr) "י\"ח", "י\"ח", "י\"ח", "י\"ח", "ק\...
## $ Quantity (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ UnitMeasure (chr) "", "י\"ח", "", "", "", "", "", "",...
## $ BisWeighted (chr) "0", "0", "0", "0", "1", "1", "1", ...
## $ QtyInPackage (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ItemPrice (chr) "25", "5", "5", "10", "94.80", "69....
## $ UnitOfMeasurePrice (chr) "25", "5", "5", "10", "94.80", "69....
## $ AllowDiscount (chr) "1", "0", "0", "0", "0", "1", "0", ...
## $ itemStatus (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ LastUpdateDate (chr) "", "", "", "", "", "", "", "", "",...
## $ LastUpdateTime (chr) "", "", "", "", "", "", "", "", "",...
#Maximum Price in each store
nibit%>%group_by(ChainID,StoreID)%>%summarise(max(ItemPrice))
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID
##
## ChainID StoreID max(ItemPrice)
## 1 7290058179503 015 99.90
## 2 7290058179503 025 99.90
## 3 7290058179503 030 99.90
## 4 7290058179503 040 99.90
## 5 7290058179503 045 95
## 6 7290058179503 060 99.90
## 7 7290661400001 011 99.90
## 8 7290661400001 015 99.90
## 9 7290661400001 030 99.90
## 10 7290661400001 040 99.90
## .. ... ... ...
#number of items in each store in stock for sale
nibit.count=nibit%>%select(ChainID,StoreID,ItemCode)%>%distinct%>%count(ChainID,StoreID)
nibit.count
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID
##
## ChainID StoreID n
## 1 7290058179503 015 30641
## 2 7290058179503 025 6447
## 3 7290058179503 030 7919
## 4 7290058179503 040 5785
## 5 7290058179503 045 4497
## 6 7290058179503 060 5431
## 7 7290661400001 011 10662
## 8 7290661400001 015 12215
## 9 7290661400001 030 10552
## 10 7290661400001 040 12407
## .. ... ... ...
#you will see that even though we are dealing with 3gb of information R is responding fast, this is because you arent actually retrieving the data in with the functions only the outputs.
chains=tbl(chain_db, "chain_list") #small files mapping chain names to chain ids
chains
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: chain_list [16 x 3]
##
## chainname chainid provider
## 1 bitan 7290725900003 private
## 2 eden 7290055755557 private
## 3 coop 7290633800006 private
## 4 doralon 7290492000005 cerberus
## 5 hazihinam 7290700100008 cerberus
## 6 keshet 7290785400000 cerberus
## 7 mega 7290055700007 private
## 8 osherad 7290103152017 cerberus
## 9 ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007 private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000 cerebus
## 13 yohananof 7290803800003 cerberus
## 14 hashook 7290661400001 nibit
## 15 lahav 7290058179503 nibit
## 16 victory 7290696200003 nibit
left_join(nibit.count,chains,by="chainid") #notice that because dplyr is converting the syntax to sql it is not case sensitive anymore.
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 5]
## Grouped by: ChainID
##
## ChainID StoreID n chainname provider
## 1 7290058179503 015 30641 lahav nibit
## 2 7290058179503 025 6447 lahav nibit
## 3 7290058179503 030 7919 lahav nibit
## 4 7290058179503 040 5785 lahav nibit
## 5 7290058179503 045 4497 lahav nibit
## 6 7290058179503 060 5431 lahav nibit
## 7 7290661400001 011 10662 hashook nibit
## 8 7290661400001 015 12215 hashook nibit
## 9 7290661400001 030 10552 hashook nibit
## 10 7290661400001 040 12407 hashook nibit
## .. ... ... ... ... ...
#If you want dplyr to put the query in the global work enviornment then add %>%collect to the end of the chain, but be careful the size of the table you collect.
class(chains)
## [1] "tbl_sqlite" "tbl_sql" "tbl"
chains.df=chains%>%collect
class(chains.df)
## [1] "tbl_df" "tbl" "data.frame"
chains.df
## Source: local data frame [16 x 3]
##
## chainname chainid provider
## 1 bitan 7290725900003 private
## 2 eden 7290055755557 private
## 3 coop 7290633800006 private
## 4 doralon 7290492000005 cerberus
## 5 hazihinam 7290700100008 cerberus
## 6 keshet 7290785400000 cerberus
## 7 mega 7290055700007 private
## 8 osherad 7290103152017 cerberus
## 9 ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007 private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000 cerebus
## 13 yohananof 7290803800003 cerberus
## 14 hashook 7290661400001 nibit
## 15 lahav 7290058179503 nibit
## 16 victory 7290696200003 nibit
#adding a table to the database
load("C:/Users/yoni/Documents/GitHub/supermarketprices/Stores/stores.markers.rdata")
names(stores.markers)
## [1] "chainid" "lastupdatedate" "subchainid" "storeid"
## [5] "bikoretno" "storetype" "chainname" "subchainname"
## [9] "storename" "address" "city" "zipcode"
## [13] "xmldocversion" "lastupdatetime" "latitude" "longitude"
## [17] "address.full" "chainname_en" "provider"
#replace '.' with '_' to make the table name compatable with sqlite
stores_markers=stores.markers%>%slice(c(1:100))
#the class is converted to VARCHAR by sqlite (there is no NVARCHAR unfortunately). and the format is kept the same, for example:
stores_markers$chainid%>%head
## [1] 7.290726e+12 7.290726e+12 7.290726e+12 7.290726e+12 7.290726e+12
## [6] 7.290726e+12
#this is scientific format and will be read in as the same just as a character string. To correct this you can change the global option for number format to long
options(scipen=999)
stores_markers$chainid%>%head
## [1] 7290725900003 7290725900003 7290725900003 7290725900003 7290725900003
## [6] 7290725900003
chains_sqlite <- copy_to(chain_db,
stores_markers,
temporary = FALSE,
indexes = list(c("chainid"),
c("storeid", "bikoretno")))
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl" "chain_list" "coop_prices_tbl"
## [4] "mega_prices_tbl" "nibit_prices_tbl" "rami_levi_addresses"
## [7] "shufersal_prices_tbl" "sqlite_stat1" "stores_all"
## [10] "stores_markers"
tbl(chain_db, "stores_markers")%>%glimpse
## Observations: 100
## Variables:
## $ chainid (dbl) 7290725900003, 7290725900003, 7290725900003, 72...
## $ lastupdatedate (chr) "09/06/2015 17:24", "09/06/2015 17:24", "09/06/...
## $ subchainid (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ storeid (int) 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ bikoretno (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ storetype (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ chainname (chr) "יינות ביתן מרכז הרשת", "יינות ביתן מרכז הרשת",...
## $ subchainname (chr) "", "", "", "", "", "", "", "", "", "", "", "",...
## $ storename (chr) "01 חאן", "03 ראשלצ", "04 צפוני", "05 חולון בית...
## $ address (chr) "הרצל 33", "שמוטקין 29", "הפנינים 38 ת.ד 125", ...
## $ city (chr) "אשקלון", "ראשון לציון", "אשקלון", "חולון", "אש...
## $ zipcode (chr) "78601", "7536332", "7810101", "5845116", "7760...
## $ xmldocversion (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ lastupdatetime (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ latitude (dbl) 31.66516, 31.97930, 31.66852, 32.00289, 31.8117...
## $ longitude (dbl) 34.58681, 34.81268, 34.60163, 34.77230, 34.6596...
## $ address.full (chr) "הרצל 33,אשקלון", "שמוטקין 29,ראשון לציון", "הפ...
## $ chainname_en (chr) "bitan", "bitan", "bitan", "bitan", "bitan", "b...
## $ provider (chr) "private", "private", "private", "private", "pr...
#there are 100 rows right now
tbl(chain_db, "stores_markers")%>%nrow
## [1] 100
#appending the rest of the data.frame to the table
db_insert_into( con = chain_db$con, table = "stores_markers", values = stores.markers%>%slice(-c(1:100)))
## [1] TRUE
tbl(chain_db, "stores_markers")%>%nrow
## [1] 807
#removing a table
db_drop_table(chain_db$con,"stores_markers")
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl" "chain_list" "coop_prices_tbl"
## [4] "mega_prices_tbl" "nibit_prices_tbl" "rami_levi_addresses"
## [7] "shufersal_prices_tbl" "sqlite_stat1" "stores_all"
#closing connections
#if you rm(chain_db) then in the next garbage collection (gc) by R it will disconnect the connection.
#you can also force the disconnect by running
RSQLite::dbDisconnect(chain_db$con)
## [1] TRUE
# connect to the sqlite file
con <- dbConnect(RSQLite::SQLite(), dbname="D:/Prices/chain_db.sqlite3")
# get a list of all tables
dbListTables(con)
## [1] "cerebus_prices_tbl" "chain_list" "coop_prices_tbl"
## [4] "mega_prices_tbl" "nibit_prices_tbl" "rami_levi_addresses"
## [7] "shufersal_prices_tbl" "sqlite_stat1" "stores_all"
## [10] "stores_markers"
#list variables in table stores_all
dbListFields(con,"stores_markers")
## [1] "chainid" "lastupdatedate" "subchainid" "storeid"
## [5] "bikoretno" "storetype" "chainname" "subchainname"
## [9] "storename" "address" "city" "zipcode"
## [13] "xmldocversion" "lastupdatetime" "latitude" "longitude"
## [17] "address.full" "chainname_en" "provider"
#retrieve first 10 from list of distinct itemscode and itemnames that have general barcodes in mega
#Notice function calls in sqlite are case sensitive
dbGetQuery(con,"select distinct itemcode,itemtype,itemname from mega_prices_tbl where itemtype=1 LIMIT 5")
## ItemCode ItemType ItemName
## 1 10900034197 1 שקית זיפר גדול דיימונד
## 2 1116592 1 גבט צרפתי כפרי
## 3 11182700923 1 שעועית אדומה בהירה 439גר
## 4 11182700930 1 שעועית אדומה כהה 439גר
## 5 11182700954 1 שעועית שחורה 439גרם
#A technical note on using REGEX or GLOB in sqlite with hebrew. It may have bugs that we havent figured out how to work with yet.
dbGetQuery(con,"select distinct chainname from stores_markers")
## chainname
## 1 יינות ביתן מרכז הרשת
## 2 רמי לוי שיווק השקמה
## 3 אושר עד
## 4 Dor Alon
## 5 כל בו חצי חינם
## 6 קשת טעמים
## 7 סופר דוש
## 8 מ. יוחננוף ובניו
## 9 עדן טבע מרקט
## 10 רשת מגה
## 11 שופרסל
## 12 ויקטורי
## 13 מחסני השוק
## 14 מחסני להב
dbGetQuery(con,"select distinct chainname from stores_markers where chainname like '%שופרסל%'")
## chainname
## 1 מ. יוחננוף ובניו
## 2 שופרסל
## 3 ויקטורי
dbGetQuery(con,"select distinct chainname from stores_markers where chainname like '%ופרסל%'")
## chainname
## 1 יינות ביתן מרכז הרשת
## 2 רמי לוי שיווק השקמה
## 3 קשת טעמים
## 4 מ. יוחננוף ובניו
## 5 שופרסל
## 6 ויקטורי
## 7 מחסני השוק
## 8 מחסני להב
#english and numbers work fine
dbGetQuery(con,"select distinct chainname_en from stores_markers")
## chainname_en
## 1 bitan
## 2 ramilevi
## 3 osherad
## 4 doralon
## 5 hazihinam
## 6 keshet
## 7 superdosh
## 8 yohananof
## 9 eden
## 10 mega
## 11 shufersal
## 12 victory
## 13 hashook
## 14 lahav
dbGetQuery(con,"select distinct chainname,chainname_en from stores_markers where chainname_en like 's%'")
## chainname chainname_en
## 1 סופר דוש superdosh
## 2 שופרסל shufersal
#close the connection to the DB when finished using
dbDisconnect(con)
## [1] TRUE