Where are the files?

All the links (17) and passwords are kept at the ministry of economics site:

Code to get the data is in the fetch_files directory

list.files("C:/Users/yoni/Documents/GitHub/supermarketprices/fetch_files/")
## [1] "bitan_fetch.r"     "cerberus_fetch.r"  "edenteva_fetch.r" 
## [4] "mega_fetch.r"      "shufersal_fetch.r"

Each file has 3 sections: Stores, Prices, Promotions

library(rvest);library(XML);library(stringr);library(plyr);library(dplyr);
## 
## Attaching package: 'XML'
## 
## The following object is masked from 'package:rvest':
## 
##     xml
## 
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
#Organize URLs
  url.base="http://publishprice.mega.co.il"
  mega.files=html(paste(url.base,format(Sys.Date(),"%Y%m%d"),sep="/"))%>%html_nodes("a")%>%html_text()
  
  mega.files=mdply(c("Price","PriceFull","Promo","PromoFull","Store"),.fun = function(x){
                 df.out=data.frame(type=x,url=paste(url.base,format(Sys.Date(),"%Y%m%d"),
                   mega.files[grepl(".gz",mega.files)&grepl(x,mega.files)],sep="/"))})%>%
    select(-X1)%>%mutate(url=as.character(url))

mega.files%>%head
##    type
## 1 Price
## 2 Price
## 3 Price
## 4 Price
## 5 Price
## 6 Price
##                                                                               url
## 1 http://publishprice.mega.co.il/20150803/Price7290055700007-0159-201508032138.gz
## 2 http://publishprice.mega.co.il/20150803/Price7290055700007-4360-201508032136.gz
## 3 http://publishprice.mega.co.il/20150803/Price7290055700007-3720-201508032038.gz
## 4 http://publishprice.mega.co.il/20150803/Price7290055700007-3680-201508032038.gz
## 5 http://publishprice.mega.co.il/20150803/Price7290055700007-2330-201508031940.gz
## 6 http://publishprice.mega.co.il/20150803/Price7290055700007-3522-201508031938.gz
#Store List
  mega.store.files=mega.files%>%filter(type=="Store")%>%slice(1)
  mega.store.files
##    type
## 1 Store
##                                                                                    url
## 1 http://publishprice.mega.co.il/20150803/StoresFull7290055700007-0000-201508030102.gz
  temp <- tempfile()
  download.file(mega.store.files$url,temp,quiet = T)
  x=readLines(gzfile(temp))[c(1:20)]
  gsub(" ","",x)
##  [1] "<?xmlversion=\"1.0\"encoding=\"windows-1255\"?>"
##  [2] "<Stores>"                                       
##  [3] "<Header>Header</Header>"                        
##  [4] "<Store>"                                        
##  [5] "<ChainId>7290055700007</ChainId>"               
##  [6] "<SubChainId>5</SubChainId>"                     
##  [7] "<StoreId>10</StoreId>"                          
##  [8] "<BikoretNo>8</BikoretNo>"                       
##  [9] "<StoreType>1</StoreType>"                       
## [10] "<ChainName>רשתמגה</ChainName>"                  
## [11] "<SubChainName>מגהבעיר</SubChainName>"           
## [12] "<StoreName>מב.פנקס</StoreName>"                 
## [13] "<Address>46-50פנקס</Address>"                   
## [14] "<City>תל-אביב</City>"                           
## [15] "<ZipCode>201103</ZipCode>"                      
## [16] "<LastUpdateDate>20150803</LastUpdateDate>"      
## [17] "<LastUpdateTime>10259</LastUpdateTime>"         
## [18] "</Store>"                                       
## [19] "<Store>"                                        
## [20] "<ChainId>7290055700007</ChainId>"
  rm(x)
  unlink(temp)
  
  options(warn=-1)
  mega.stores=ddply(mega.store.files,.(type,url),.fun = function(x){
    temp <- tempfile()
    download.file(x$url,temp,quiet = T)
    mega.out=xmlToDataFrame(xmlParse(readLines(gzfile(temp),encoding="UTF-8")))[-1,-1]
    unlink(temp)
    return(mega.out)})%>%mutate_each(funs(iconv(.,"UTF-8")))%>%
    mutate(StoreURLId=str_pad(paste0(StoreId,BikoretNo),4,side = "left","0"))
  options(warn=0)
  
  mega.stores%>%select(-c(type,url))%>%head
##         ChainId SubChainId StoreId BikoretNo StoreType ChainName
## 1 7290055700007          5      10         8         1   רשת מגה
## 2 7290055700007          5      11         6         1   רשת מגה
## 3 7290055700007          5      13         2         1   רשת מגה
## 4 7290055700007          5      14         0         1   רשת מגה
## 5 7290055700007          5      15         9         1   רשת מגה
## 6 7290055700007          5      18         3         1   רשת מגה
##   SubChainName         StoreName        Address    City ZipCode
## 1     מגה בעיר          מב. פנקס     46-50 פנקס תל-אביב  201103
## 2     מגה בעיר  מב. כיכר דיזנגוף  9 ככר דיזנגוף תל-אביב  200909
## 3     מגה בעיר מב. ויצמן תל אביב       20 ויצמן תל-אביב  200411
## 4     מגה בעיר  מב. חיפה גרינברג 20 א.צ גרינברג    חיפה  201010
## 5     מגה בעיר   מב. רמת הטייסים         2 עודד תל-אביב  200411
## 6     מגה בעיר  מב. עוזיאל בת-ים       5 עוזיאל   בת-ים  200406
##   LastUpdateDate LastUpdateTime StoreURLId
## 1       20150803          10259       0108
## 2       20150803          10259       0116
## 3       20150803          10259       0132
## 4       20150803          10259       0140
## 5       20150803          10259       0159
## 6       20150803          10259       0183
#Prices
  mega.price.files=mega.files%>%filter(type=="PriceFull")%>%slice(1)
  options(warn=-1)
  mega.prices=ddply(mega.price.files,.(type,url),.fun = function(x){
    temp <- tempfile()
    download.file(x$url,temp,quiet = T)
    mega.out=xmlToDataFrame(xmlParse(readLines(gzfile(temp),encoding="UTF-8")))[-1,-1]
    unlink(temp)
    return(mega.out)},.progress = "text")%>%mutate_each(funs(iconv(.,"UTF-8")))
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
  options(warn=0)
  
  mega.prices%>%names
##  [1] "type"                        "url"                        
##  [3] "Chainid"                     "SubChainid"                 
##  [5] "Storeid"                     "BikoretNo"                  
##  [7] "PriceUpdateDate"             "PriceUpdateTime"            
##  [9] "ItemCode"                    "ItemType"                   
## [11] "ItemName"                    "ManufacturerName"           
## [13] "ManufacturerItemCountry"     "ManufacturerItemDescription"
## [15] "UnitQty"                     "Quantity"                   
## [17] "UnitOfMeasure"               "BlsWeighted"                
## [19] "QtyInPackage"                "ItemPrice"                  
## [21] "UnitOfMeasurePrice"          "AllowDiscount"              
## [23] "ItemStatus"
#  mega.prices%>%head

#Promotions (next time)

Javascript is used to fill out forms automatically in sites like cerebus. The RSelenium package is used to run the code. This is a lecture within itself. There is a ready coded file to fetch any cerebus chain store data cerebus_fetch.

The Givat Ram CS group (Gali Noti contact person) has all the code running every hour into the HUJI servers and parsing them into a sqlite database. They are getting the database ready for public consumption in the coming month or so. We have a slice of the raw data for anyone who wants it and a full days price data in an sqlite db.