Retrieving the Prices

An SQLite database was set up which has all the prices for each chain store that published to the web on 09/07/2015. This database has been archived (roughly 0.5gb archived and 3gb decompressed) and placed on the r-israel website for open access. The database can be accessed by many packages dplyr, RSQlite, DBI.

DPLYR

library(dplyr,quietly = T,warn.conflicts = F)
library(RSQLite,quietly = T,warn.conflicts = F)
chain_db <- src_sqlite(path = "D:/Prices/Single Day/chain_db.sqlite3", create = F)

#List tables in database
db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl"   "chain_list"           "coop_prices_tbl"     
## [4] "mega_prices_tbl"      "nibit_prices_tbl"     "rami_levi_addresses" 
## [7] "shufersal_prices_tbl" "sqlite_stat1"         "stores_all"
#The tables are split into data provider groups because of the uniform xml formats within each group, cerebus and nibit give data for 7 and 3 chains respectively. 

nibit=tbl(chain_db, "nibit_prices_tbl") #single day prices for all 3 chains in nibit

nibit%>%glimpse
## Observations: 815670
## Variables:
## $ X1                         (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ChainID                    (chr) "7290661400001", "7290661400001", "...
## $ SubChainID                 (chr) "001", "001", "001", "001", "001", ...
## $ StoreID                    (chr) "071", "071", "071", "071", "071", ...
## $ BikoretNo                  (chr) "000", "000", "000", "000", "000", ...
## $ PriceUpdateDate            (chr) "2013/12/15 09:39", "2012/11/27 22:...
## $ ItemCode                   (chr) "1", "19", "26", "39", "61", "64", ...
## $ ItemType                   (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ ItemName                   (chr) "משלוח", "תרומות למשפחות נזקקות", "...
## $ ManufactureName            (chr) "משלוחים/תווים", "שונות /שרות", "שו...
## $ ManufactureCountry         (chr) "", "", "", "", "", "", "", "", "",...
## $ ManufactureItemDescription (chr) "", "", "", "", "", "", "", "", "",...
## $ UnitQty                    (chr) "י\"ח", "י\"ח", "י\"ח", "י\"ח", "ק\...
## $ Quantity                   (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ UnitMeasure                (chr) "", "י\"ח", "", "", "", "", "", "",...
## $ BisWeighted                (chr) "0", "0", "0", "0", "1", "1", "1", ...
## $ QtyInPackage               (chr) "1", "1", "1", "1", "1", "1", "1", ...
## $ ItemPrice                  (chr) "25", "5", "5", "10", "94.80", "69....
## $ UnitOfMeasurePrice         (chr) "25", "5", "5", "10", "94.80", "69....
## $ AllowDiscount              (chr) "1", "0", "0", "0", "0", "1", "0", ...
## $ itemStatus                 (chr) "0", "0", "0", "0", "0", "0", "0", ...
## $ LastUpdateDate             (chr) "", "", "", "", "", "", "", "", "",...
## $ LastUpdateTime             (chr) "", "", "", "", "", "", "", "", "",...
#Maximum Price in each store
nibit%>%group_by(ChainID,StoreID)%>%summarise(max(ItemPrice))
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID 
## 
##          ChainID StoreID max(ItemPrice)
## 1  7290058179503     015          99.90
## 2  7290058179503     025          99.90
## 3  7290058179503     030          99.90
## 4  7290058179503     040          99.90
## 5  7290058179503     045             95
## 6  7290058179503     060          99.90
## 7  7290661400001     011          99.90
## 8  7290661400001     015          99.90
## 9  7290661400001     030          99.90
## 10 7290661400001     040          99.90
## ..           ...     ...            ...
#number of items in each store in stock for sale
nibit.count=nibit%>%select(ChainID,StoreID,ItemCode)%>%distinct%>%count(ChainID,StoreID)

nibit.count
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 3]
## Grouped by: ChainID 
## 
##          ChainID StoreID     n
## 1  7290058179503     015 30641
## 2  7290058179503     025  6447
## 3  7290058179503     030  7919
## 4  7290058179503     040  5785
## 5  7290058179503     045  4497
## 6  7290058179503     060  5431
## 7  7290661400001     011 10662
## 8  7290661400001     015 12215
## 9  7290661400001     030 10552
## 10 7290661400001     040 12407
## ..           ...     ...   ...
#you will see that even though we are dealing with 3gb of information R is responding fast, this is because you arent actually retrieving the data in with the functions only the outputs.

chains=tbl(chain_db, "chain_list") #small files mapping chain names to chain ids
chains
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: chain_list [16 x 3]
## 
##    chainname       chainid provider
## 1      bitan 7290725900003  private
## 2       eden 7290055755557  private
## 3       coop 7290633800006  private
## 4    doralon 7290492000005 cerberus
## 5  hazihinam 7290700100008 cerberus
## 6     keshet 7290785400000 cerberus
## 7       mega 7290055700007  private
## 8    osherad 7290103152017 cerberus
## 9   ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007  private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000  cerebus
## 13 yohananof 7290803800003 cerberus
## 14   hashook 7290661400001    nibit
## 15     lahav 7290058179503    nibit
## 16   victory 7290696200003    nibit
left_join(nibit.count,chains,by="chainid") #notice that because dplyr is converting the syntax to sql it is not case sensitive anymore. 
## Source: sqlite 3.8.6 [D:/Prices/Single Day/chain_db.sqlite3]
## From: <derived table> [?? x 5]
## Grouped by: ChainID 
## 
##          ChainID StoreID     n chainname provider
## 1  7290058179503     015 30641     lahav    nibit
## 2  7290058179503     025  6447     lahav    nibit
## 3  7290058179503     030  7919     lahav    nibit
## 4  7290058179503     040  5785     lahav    nibit
## 5  7290058179503     045  4497     lahav    nibit
## 6  7290058179503     060  5431     lahav    nibit
## 7  7290661400001     011 10662   hashook    nibit
## 8  7290661400001     015 12215   hashook    nibit
## 9  7290661400001     030 10552   hashook    nibit
## 10 7290661400001     040 12407   hashook    nibit
## ..           ...     ...   ...       ...      ...
#If you want dplyr to put the query in the global work enviornment then add %>%collect to the end of the chain, but be careful the size of the table you collect.

class(chains)
## [1] "tbl_sqlite" "tbl_sql"    "tbl"
chains.df=chains%>%collect

class(chains.df)
## [1] "tbl_df"     "tbl"        "data.frame"
chains.df
## Source: local data frame [16 x 3]
## 
##    chainname       chainid provider
## 1      bitan 7290725900003  private
## 2       eden 7290055755557  private
## 3       coop 7290633800006  private
## 4    doralon 7290492000005 cerberus
## 5  hazihinam 7290700100008 cerberus
## 6     keshet 7290785400000 cerberus
## 7       mega 7290055700007  private
## 8    osherad 7290103152017 cerberus
## 9   ramilevi 7290058140886 cerberus
## 10 shufersal 7290027600007  private
## 11 superdosh 7290873900009 cerberus
## 12 yohananof 0000000000000  cerebus
## 13 yohananof 7290803800003 cerberus
## 14   hashook 7290661400001    nibit
## 15     lahav 7290058179503    nibit
## 16   victory 7290696200003    nibit
#adding a table to the database
load("C:/Users/yoni/Documents/GitHub/supermarketprices/Stores/stores.markers.rdata")

names(stores.markers)
##  [1] "chainid"        "lastupdatedate" "subchainid"     "storeid"       
##  [5] "bikoretno"      "storetype"      "chainname"      "subchainname"  
##  [9] "storename"      "address"        "city"           "zipcode"       
## [13] "xmldocversion"  "lastupdatetime" "latitude"       "longitude"     
## [17] "address.full"   "chainname_en"   "provider"
#replace '.' with '_' to make the table name compatable with sqlite 

stores_markers=stores.markers%>%slice(c(1:100))

#the class is converted to VARCHAR by sqlite (there is no NVARCHAR unfortunately). and the format is kept the same, for example:

stores_markers$chainid%>%head
## [1] 7.290726e+12 7.290726e+12 7.290726e+12 7.290726e+12 7.290726e+12
## [6] 7.290726e+12
#this is scientific format and will be read in as the same just as a character string. To correct this you can change the global option for number format to long 

options(scipen=999)

stores_markers$chainid%>%head
## [1] 7290725900003 7290725900003 7290725900003 7290725900003 7290725900003
## [6] 7290725900003
chains_sqlite <- copy_to(chain_db, 
                         stores_markers, 
                         temporary = FALSE, 
                         indexes = list(c("chainid"),
                                        c("storeid", "bikoretno")))

db_list_tables(chain_db$con)
##  [1] "cerebus_prices_tbl"   "chain_list"           "coop_prices_tbl"     
##  [4] "mega_prices_tbl"      "nibit_prices_tbl"     "rami_levi_addresses" 
##  [7] "shufersal_prices_tbl" "sqlite_stat1"         "stores_all"          
## [10] "stores_markers"
tbl(chain_db, "stores_markers")%>%glimpse
## Observations: 100
## Variables:
## $ chainid        (dbl) 7290725900003, 7290725900003, 7290725900003, 72...
## $ lastupdatedate (chr) "09/06/2015 17:24", "09/06/2015 17:24", "09/06/...
## $ subchainid     (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ storeid        (int) 1, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15,...
## $ bikoretno      (int) 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0,...
## $ storetype      (int) 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ chainname      (chr) "יינות ביתן מרכז הרשת", "יינות ביתן מרכז הרשת",...
## $ subchainname   (chr) "", "", "", "", "", "", "", "", "", "", "", "",...
## $ storename      (chr) "01 חאן", "03 ראשלצ", "04 צפוני", "05 חולון בית...
## $ address        (chr) "הרצל 33", "שמוטקין 29", "הפנינים 38 ת.ד 125", ...
## $ city           (chr) "אשקלון", "ראשון לציון", "אשקלון", "חולון", "אש...
## $ zipcode        (chr) "78601", "7536332", "7810101", "5845116", "7760...
## $ xmldocversion  (int) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ lastupdatetime (chr) NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA, NA,...
## $ latitude       (dbl) 31.66516, 31.97930, 31.66852, 32.00289, 31.8117...
## $ longitude      (dbl) 34.58681, 34.81268, 34.60163, 34.77230, 34.6596...
## $ address.full   (chr) "הרצל 33,אשקלון", "שמוטקין 29,ראשון לציון", "הפ...
## $ chainname_en   (chr) "bitan", "bitan", "bitan", "bitan", "bitan", "b...
## $ provider       (chr) "private", "private", "private", "private", "pr...
#there are 100 rows right now
tbl(chain_db, "stores_markers")%>%nrow
## [1] 100
#appending the rest of the data.frame to the table
db_insert_into( con = chain_db$con, table = "stores_markers", values = stores.markers%>%slice(-c(1:100)))
## [1] TRUE
tbl(chain_db, "stores_markers")%>%nrow
## [1] 807
#removing a table
db_drop_table(chain_db$con,"stores_markers")

db_list_tables(chain_db$con)
## [1] "cerebus_prices_tbl"   "chain_list"           "coop_prices_tbl"     
## [4] "mega_prices_tbl"      "nibit_prices_tbl"     "rami_levi_addresses" 
## [7] "shufersal_prices_tbl" "sqlite_stat1"         "stores_all"
#closing connections
#if you rm(chain_db) then in the next garbage collection (gc) by R it will disconnect the connection.
#you can also force the disconnect by running
RSQLite::dbDisconnect(chain_db$con)
## [1] TRUE

RSQLite

# connect to the sqlite file
con <- dbConnect(RSQLite::SQLite(), dbname="D:/Prices/chain_db.sqlite3")
# get a list of all tables
dbListTables(con)
##  [1] "cerebus_prices_tbl"   "chain_list"           "coop_prices_tbl"     
##  [4] "mega_prices_tbl"      "nibit_prices_tbl"     "rami_levi_addresses" 
##  [7] "shufersal_prices_tbl" "sqlite_stat1"         "stores_all"          
## [10] "stores_markers"
#list variables in table stores_all
dbListFields(con,"stores_markers")
##  [1] "chainid"        "lastupdatedate" "subchainid"     "storeid"       
##  [5] "bikoretno"      "storetype"      "chainname"      "subchainname"  
##  [9] "storename"      "address"        "city"           "zipcode"       
## [13] "xmldocversion"  "lastupdatetime" "latitude"       "longitude"     
## [17] "address.full"   "chainname_en"   "provider"
#retrieve first 10 from list of distinct itemscode and itemnames that have general barcodes in mega

#Notice function calls in sqlite are case sensitive

dbGetQuery(con,"select distinct itemcode,itemtype,itemname from mega_prices_tbl where itemtype=1 LIMIT 5")
##      ItemCode ItemType                 ItemName
## 1 10900034197        1   שקית זיפר גדול דיימונד
## 2     1116592        1           גבט צרפתי כפרי
## 3 11182700923        1 שעועית אדומה בהירה 439גר
## 4 11182700930        1   שעועית אדומה כהה 439גר
## 5 11182700954        1      שעועית שחורה 439גרם
#A technical note on using REGEX or GLOB in sqlite with hebrew. It may have bugs that we havent figured out how to work with yet. 

dbGetQuery(con,"select distinct chainname from stores_markers")
##               chainname
## 1  יינות ביתן מרכז הרשת
## 2   רמי לוי שיווק השקמה
## 3               אושר עד
## 4              Dor Alon
## 5        כל בו חצי חינם
## 6             קשת טעמים
## 7              סופר דוש
## 8      מ. יוחננוף ובניו
## 9          עדן טבע מרקט
## 10              רשת מגה
## 11               שופרסל
## 12              ויקטורי
## 13           מחסני השוק
## 14            מחסני להב
dbGetQuery(con,"select distinct chainname from stores_markers where chainname like '%שופרסל%'")
##          chainname
## 1 מ. יוחננוף ובניו
## 2           שופרסל
## 3          ויקטורי
dbGetQuery(con,"select distinct chainname from stores_markers where chainname like '%ופרסל%'")
##              chainname
## 1 יינות ביתן מרכז הרשת
## 2  רמי לוי שיווק השקמה
## 3            קשת טעמים
## 4     מ. יוחננוף ובניו
## 5               שופרסל
## 6              ויקטורי
## 7           מחסני השוק
## 8            מחסני להב
#english and numbers work fine
dbGetQuery(con,"select distinct chainname_en from stores_markers")
##    chainname_en
## 1         bitan
## 2      ramilevi
## 3       osherad
## 4       doralon
## 5     hazihinam
## 6        keshet
## 7     superdosh
## 8     yohananof
## 9          eden
## 10         mega
## 11    shufersal
## 12      victory
## 13      hashook
## 14        lahav
dbGetQuery(con,"select distinct chainname,chainname_en from stores_markers where chainname_en like 's%'")
##   chainname chainname_en
## 1  סופר דוש    superdosh
## 2    שופרסל    shufersal
#close the connection to the DB when finished using
dbDisconnect(con)
## [1] TRUE