meow.sh/scrape.awk

37 lines
957 B
Awk
Raw Permalink Normal View History

2013-05-25 19:15:43 -07:00
# opentag(str, tag)
#   Return the text between the first "<tag>" in str and the first "</tag>"
#   that follows it. Returns "" when either marker is missing.
#   Only the bare form "<tag>" is recognised; an opening tag that carries
#   attributes does not match.
#   Names after the wide gap in the parameter list are locals (awk idiom);
#   callers pass only str and tag.
function opentag(str, tag,    open_tag, close_tag, start, rest, stop) {
    open_tag  = "<" tag ">"
    close_tag = "</" tag ">"

    start = index(str, open_tag)
    if (!start)
        return ""

    # Look for the closing tag only in what follows the opening tag, so a
    # stray closer earlier in the string cannot hide a valid pair.
    rest = substr(str, start + length(open_tag))
    stop = index(rest, close_tag)
    if (!stop)
        return ""

    return substr(rest, 1, stop - 1)
}
# unescape(str)
#   Replace every decimal numeric character reference ("&#NNN;") in str with
#   the character that has that code and return the result.
#   Named entities (&amp; ...) and hexadecimal references are left untouched.
#   Requires gawk: uses the three-argument form of match().
#   `out` and `m` are locals.
function unescape(str,    out, m) {
    out = ""
    # Consume the input left to right. Text that has already been decoded is
    # moved to `out` and never rescanned, so "&#38;#65;" yields "&#65;" rather
    # than being decoded a second time into "A".
    while (match(str, /&#([0-9]+);/, m)) {
        # "+ 0" forces a numeric value: %c prints the character with that
        # code for a number, but only the first character of a string.
        out = out substr(str, 1, RSTART - 1) sprintf("%c", m[1] + 0)
        str = substr(str, RSTART + RLENGTH)
    }
    return out str
}
# hotdate(str)
#   Convert a date such as "Sat, 25 May 2013 19:15:43 -0700" to seconds since
#   the Unix epoch. str is split on spaces and colons, so the fields are:
#   1 weekday, 2 day, 3 month name, 4 year, 5 hour, 6 minute, 7 second, 8 zone.
#   Reads the global `months` table (month abbreviation -> "01".."12").
#   Returns -1 when the month is unknown or mktime() rejects the date.
#
#   mktime() interprets its argument in the local time zone, so the zone
#   field is applied explicitly. When it is neither a numeric "+HHMM"/"-HHMM"
#   offset nor GMT/UT/UTC/Z, or the local offset cannot be obtained from
#   strftime("%z"), the local-time interpretation is returned unchanged.
#   Names after the wide gap in the parameter list are locals.
function hotdate(str,    f, t, zone, here) {
    split(str, f, "[ :]")
    if (!(f[3] in months))
        return -1

    t = mktime(f[4] " " months[f[3]] " " f[2] " " f[5] " " f[6] " " f[7])
    if (t == -1)
        return t

    zone = f[8]
    if (zone == "GMT" || zone == "UT" || zone == "UTC" || zone == "Z")
        zone = "+0000"

    # Offset of the local zone at instant t; t + offset is the epoch the
    # wall-clock fields would have if they were UTC.
    here = strftime("%z", t)
    if (zone !~ /^[+-][0-9][0-9][0-9][0-9]$/ || here !~ /^[+-][0-9][0-9][0-9][0-9]$/)
        return t

    return t + _zone_seconds(here) - _zone_seconds(zone)
}

# Convert a "+HHMM" / "-HHMM" zone string into a signed number of seconds.
function _zone_seconds(z,    secs) {
    secs = substr(z, 2, 2) * 3600 + substr(z, 4, 2) * 60
    return (substr(z, 1, 1) == "-") ? -secs : secs
}
BEGIN {
    # Build the global `months` table used by hotdate(): each three-letter
    # month abbreviation maps to its zero-padded month number
    # ("Jan" -> "01" ... "Dec" -> "12").
    # Table-building idiom from http://stackoverflow.com/a/2123002
    m = split("Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec", d, "|")
    o = 1
    while (o <= m) {
        months[d[o]] = sprintf("%02d", o)
        o++
    }
}
# run(str)
#   Extract the <title>, <pubDate> and <link> elements from str, unescape
#   numeric character references in each, convert the publication date to
#   epoch seconds with hotdate(), and print one line:
#       title sep link sep epoch
#   `sep` is a global that is not assigned in this file; presumably it is
#   supplied on the command line (e.g. -v sep=...) -- confirm with the caller.
#   Names after the wide gap in the parameter list are locals, so nothing
#   from one record lingers in the global namespace.
function run(str,    title, pub, pubunix, torrent) {
    title = unescape(opentag(str, "title"))
    pub   = unescape(opentag(str, "pubDate"))
    # Earlier approach, kept for reference (spawns one `date` process per record):
    # "date -d \""pub "\" +%s" | getline pubunix
    pubunix = hotdate(pub)
    torrent = unescape(opentag(str, "link"))
    print title sep torrent sep pubunix
}
# Main rule: every input record is passed whole to run(), which prints one
# output line for it.
{
    run($0)
}