All the links (17) and passwords are kept on the Ministry of Economics site:
The code to fetch the data is in the fetch_files directory:
list.files("C:/Users/yoni/Documents/GitHub/supermarketprices/fetch_files/")
## [1] "bitan_fetch.r" "cerberus_fetch.r" "edenteva_fetch.r"
## [4] "mega_fetch.r" "shufersal_fetch.r"
Each file has three sections: Stores, Prices, and Promotions.
library(rvest)
library(XML)
library(stringr)
library(plyr)
library(dplyr)
##
## Attaching package: 'XML'
##
## The following object is masked from 'package:rvest':
##
## xml
##
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Organize URLs: list today's published files and build a table of file types and URLs
url.base="http://publishprice.mega.co.il"
mega.files=html(paste(url.base,format(Sys.Date(),"%Y%m%d"),sep="/"))%>%
  html_nodes("a")%>%html_text()
# mdply adds an index column X1, dropped below
mega.files=mdply(c("Price","PriceFull","Promo","PromoFull","Store"),.fun = function(x){
  df.out=data.frame(type=x,url=paste(url.base,format(Sys.Date(),"%Y%m%d"),
                    mega.files[grepl(".gz",mega.files)&grepl(x,mega.files)],sep="/"))})%>%
  select(-X1)%>%mutate(url=as.character(url))
mega.files%>%head
## type
## 1 Price
## 2 Price
## 3 Price
## 4 Price
## 5 Price
## 6 Price
## url
## 1 http://publishprice.mega.co.il/20150803/Price7290055700007-0159-201508032138.gz
## 2 http://publishprice.mega.co.il/20150803/Price7290055700007-4360-201508032136.gz
## 3 http://publishprice.mega.co.il/20150803/Price7290055700007-3720-201508032038.gz
## 4 http://publishprice.mega.co.il/20150803/Price7290055700007-3680-201508032038.gz
## 5 http://publishprice.mega.co.il/20150803/Price7290055700007-2330-201508031940.gz
## 6 http://publishprice.mega.co.il/20150803/Price7290055700007-3522-201508031938.gz
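As a quick check (not in the original code), we can tally how many URLs of each type were picked up. Note that grepl("Price",...) also matches the PriceFull file names (and likewise Promo/PromoFull, Store/StoresFull), so the Price rows include both:
mega.files%>%count(type)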
# Store List
mega.store.files=mega.files%>%filter(type=="Store")%>%slice(1)
mega.store.files
## type
## 1 Store
## url
## 1 http://publishprice.mega.co.il/20150803/StoresFull7290055700007-0000-201508030102.gz
temp <- tempfile()
download.file(mega.store.files$url,temp,quiet = T)
x=readLines(gzfile(temp))[1:20]  # preview the first 20 lines of the stores XML
gsub(" ","",x)                   # strip spaces for a compact display
## [1] "<?xmlversion=\"1.0\"encoding=\"windows-1255\"?>"
## [2] "<Stores>"
## [3] "<Header>Header</Header>"
## [4] "<Store>"
## [5] "<ChainId>7290055700007</ChainId>"
## [6] "<SubChainId>5</SubChainId>"
## [7] "<StoreId>10</StoreId>"
## [8] "<BikoretNo>8</BikoretNo>"
## [9] "<StoreType>1</StoreType>"
## [10] "<ChainName>רשתמגה</ChainName>"
## [11] "<SubChainName>מגהבעיר</SubChainName>"
## [12] "<StoreName>מב.פנקס</StoreName>"
## [13] "<Address>46-50פנקס</Address>"
## [14] "<City>תל-אביב</City>"
## [15] "<ZipCode>201103</ZipCode>"
## [16] "<LastUpdateDate>20150803</LastUpdateDate>"
## [17] "<LastUpdateTime>10259</LastUpdateTime>"
## [18] "</Store>"
## [19] "<Store>"
## [20] "<ChainId>7290055700007</ChainId>"
rm(x)
unlink(temp)
options(warn=-1)
mega.stores=ddply(mega.store.files,.(type,url),.fun = function(x){
  temp <- tempfile()
  download.file(x$url,temp,quiet = T)
  # drop the Header row and column, keeping only the Store records
  mega.out=xmlToDataFrame(xmlParse(readLines(gzfile(temp),encoding="UTF-8")))[-1,-1]
  unlink(temp)
  return(mega.out)})%>%mutate_each(funs(iconv(.,"UTF-8")))%>%
  # StoreURLId (StoreId plus BikoretNo, zero-padded to 4 digits) matches the store id embedded in the file URLs
  mutate(StoreURLId=str_pad(paste0(StoreId,BikoretNo),4,side = "left","0"))
options(warn=0)
mega.stores%>%select(-c(type,url))%>%head
## ChainId SubChainId StoreId BikoretNo StoreType ChainName
## 1 7290055700007 5 10 8 1 רשת מגה
## 2 7290055700007 5 11 6 1 רשת מגה
## 3 7290055700007 5 13 2 1 רשת מגה
## 4 7290055700007 5 14 0 1 רשת מגה
## 5 7290055700007 5 15 9 1 רשת מגה
## 6 7290055700007 5 18 3 1 רשת מגה
## SubChainName StoreName Address City ZipCode
## 1 מגה בעיר מב. פנקס 46-50 פנקס תל-אביב 201103
## 2 מגה בעיר מב. כיכר דיזנגוף 9 ככר דיזנגוף תל-אביב 200909
## 3 מגה בעיר מב. ויצמן תל אביב 20 ויצמן תל-אביב 200411
## 4 מגה בעיר מב. חיפה גרינברג 20 א.צ גרינברג חיפה 201010
## 5 מגה בעיר מב. רמת הטייסים 2 עודד תל-אביב 200411
## 6 מגה בעיר מב. עוזיאל בת-ים 5 עוזיאל בת-ים 200406
## LastUpdateDate LastUpdateTime StoreURLId
## 1 20150803 10259 0108
## 2 20150803 10259 0116
## 3 20150803 10259 0132
## 4 20150803 10259 0140
## 5 20150803 10259 0159
## 6 20150803 10259 0183
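To see how StoreURLId lines up with the ids embedded in the file URLs, a small worked example (not in the original code): store 15 with BikoretNo 9 gives "0159", the same id that appears in the first Price URL above.
str_pad(paste0(15,9),4,side = "left","0")
## [1] "0159"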
# Prices
mega.price.files=mega.files%>%filter(type=="PriceFull")%>%slice(1)
options(warn=-1)
mega.prices=ddply(mega.price.files,.(type,url),.fun = function(x){
  temp <- tempfile()
  download.file(x$url,temp,quiet = T)
  mega.out=xmlToDataFrame(xmlParse(readLines(gzfile(temp),encoding="UTF-8")))[-1,-1]
  unlink(temp)
  return(mega.out)},.progress = "text")%>%mutate_each(funs(iconv(.,"UTF-8")))
options(warn=0)
mega.prices%>%names
## [1] "type" "url"
## [3] "Chainid" "SubChainid"
## [5] "Storeid" "BikoretNo"
## [7] "PriceUpdateDate" "PriceUpdateTime"
## [9] "ItemCode" "ItemType"
## [11] "ItemName" "ManufacturerName"
## [13] "ManufacturerItemCountry" "ManufacturerItemDescription"
## [15] "UnitQty" "Quantity"
## [17] "UnitOfMeasure" "BlsWeighted"
## [19] "QtyInPackage" "ItemPrice"
## [21] "UnitOfMeasurePrice" "AllowDiscount"
## [23] "ItemStatus"
# mega.prices%>%head
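Since xmlToDataFrame returns every column as text, a quick sanity check is to coerce ItemPrice to numeric; a sketch (output omitted, as it depends on the day's file):
mega.prices%>%mutate(ItemPrice=as.numeric(ItemPrice))%>%
  summarise(n.items=n(),min.price=min(ItemPrice,na.rm=T),max.price=max(ItemPrice,na.rm=T))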
# Promotions (next time)
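Although promotions are deferred to next time, they can be fetched with exactly the same pattern as the prices, substituting the PromoFull files; a sketch, untested against a live file:
mega.promo.files=mega.files%>%filter(type=="PromoFull")%>%slice(1)
mega.promos=ddply(mega.promo.files,.(type,url),.fun = function(x){
  temp <- tempfile()
  download.file(x$url,temp,quiet = T)
  # same parse as the price files: drop the Header row and column
  promo.out=xmlToDataFrame(xmlParse(readLines(gzfile(temp),encoding="UTF-8")))[-1,-1]
  unlink(temp)
  return(promo.out)})%>%mutate_each(funs(iconv(.,"UTF-8")))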
JavaScript is needed to fill out login forms automatically on sites like Cerberus, and the RSelenium package is used to drive the browser from R; that is a lecture in itself. There is a ready-made script, cerberus_fetch.r, that fetches the data for any chain hosted on Cerberus.
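For flavor, here is a minimal sketch of the RSelenium approach, assuming a Selenium server is already running locally; the login URL and the form element names below are hypothetical placeholders (see cerberus_fetch.r for the real ones):
library(RSelenium)
remDr=remoteDriver(remoteServerAddr = "localhost",port = 4444L,browserName = "firefox")
remDr$open()
remDr$navigate("http://cerberus-portal.example/login")                                     # placeholder URL
remDr$findElement(using = "name",value = "username")$sendKeysToElement(list("chain-user")) # hypothetical field name
remDr$findElement(using = "name",value = "password")$sendKeysToElement(list(""))           # hypothetical field name
remDr$findElement(using = "css selector",value = "button[type='submit']")$clickElement()   # hypothetical selector
# once logged in, scrape the file listing much as with Mega above
remDr$close()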
The Givat Ram CS group (contact: Gali Noti) has all the code running hourly on the HUJI servers, parsing the files into an SQLite database. They expect the database to be ready for public consumption within the next month or so. We have a slice of the raw data for anyone who wants it, as well as a full day's price data in an SQLite db.
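For anyone taking the SQLite file, reading it from R is straightforward; a sketch, where the file name and table name (prices.sqlite, prices) are hypothetical:
library(RSQLite)
con=dbConnect(SQLite(),"prices.sqlite")        # hypothetical file name
dbListTables(con)                              # see what tables the db holds
head(dbGetQuery(con,"SELECT * FROM prices"))   # hypothetical table name
dbDisconnect(con)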