From 4fd2dfcee62398a944b6e6cdd951d001436dd6bc Mon Sep 17 00:00:00 2001 From: Connor Olding Date: Sat, 15 Mar 2014 11:56:56 -0700 Subject: [PATCH] split searching from filtering, log searches --- meow.sh | 75 ++++++++++++++++++++++++++++++++---------------------- run | 11 +++++--- scrape.awk | 5 +--- 3 files changed, 54 insertions(+), 37 deletions(-) diff --git a/meow.sh b/meow.sh index c7e2acc..2525362 100644 --- a/meow.sh +++ b/meow.sh @@ -1,11 +1,11 @@ #!/usr/bin/env bash -SEP=$'\1' +SEP=$'\t' curl=(curl -sS -m 32 --connect-timeout 8 --retry 3 --retry-delay 1) # all timestamps are given in seconds since the epoch -declare -A groupinsane # unsanitized group names -declare -A groupregex -declare -A grouptime # last seen release +declare -A searchquery +declare -A searchregex +declare -A searchtime # last seen release die() { echo -E "$@" >&2 @@ -17,8 +17,8 @@ retrieve() { "http://www.nyaa.se/" } -nullcheck() { # {group name} - [[ -n "$1" ]] || die "Null group name"; +nullcheck() { # {query} + [[ -n "$1" ]] || die "Null search query"; } sanitize() { @@ -29,67 +29,82 @@ splittags() { # {tag} awk -v tag="$1" -f "$SRCDIR/splittags.awk" } -scrape() { # {group name} {timestamp} - TZ=UTC0 awk -v g="$1" -v ts="${2:-0}" -v sep="$SEP" -f "$SRCDIR/scrape.awk" +scrape() { + TZ=UTC0 awk -v sep="$SEP" -f "$SRCDIR/scrape.awk" } -watch() { # {group name} [regex...] +watch() { # {search query} [regex...] nullcheck "$1" local gs="$(sanitize<<<"$1")" regex= - groupinsane[$gs]="$1" + searchquery[$gs]="$1" shift for regex; do - groupregex[$gs]+="|($regex)" + searchregex[$gs]+="|($regex)" done } -touchgroup() { # {group name} {timestamp} +touchquery() { # {search query} {timestamp} nullcheck "$1" local gs="$(sanitize<<<"$1")" - grouptime[$gs]="$2" + searchtime[$gs]="$2" } -groupreleases() { # groupname [timestamp] +search() { nullcheck "$1" - retrieve "$1" | tr -d '\r\n'"$SEP" | splittags item | scrape "$1" "${2:-}" - [ ${PIPESTATUS[0]} = 0 ] || die "Failed to retrieve releases for $1" + retrieve "$1" | tr -d '\r\n'"$SEP" | splittags item | scrape + [ ${PIPESTATUS[0]} = 0 ] || die "Failed to search for $1" } -groupfilter() { # groupname regex [timestamp] - groupreleases "$1" "${3:-}" | while IFS=$SEP read -r title etc; do +searchfilter() { # key regex [timestamp] + while IFS=$SEP read -r title etc; do grep -P "$2" <<< "$title" >/dev/null && echo -E "$title$SEP$etc" - done + done < db.txt [ ${PIPESTATUS[0]} = 0 ] || exit 1 } cleanup() { local gs= v= - for gs in "${!grouptime[@]}"; do - v="${grouptime[$gs]}" - echo -E "touchgroup $gs $v" >> times.sh + for gs in "${!searchtime[@]}"; do + v="${searchtime[$gs]}" + echo -E "touchquery $gs $v" >> times.sh [ -e "$gs.xml" ] && rm "$gs.xml" done exit ${1:-1} } -rungroup() { - local insane= regex= timestamp= res= _= recent= - insane="${groupinsane[$1]}" - regex="${groupregex[$1]:1}" - timestamp="${grouptime[$1]}" - res="$(groupfilter "$insane" "$regex" "$timestamp")" +runfilter() { + local query= regex= timestamp= res= _= recent= + query="${searchquery[$1]}" + regex="${searchregex[$1]:1}" # exclude first | character + timestamp="${searchtime[$1]}" + res="$(searchfilter "$query" "$regex" "$timestamp")" [ $? = 0 ] || return $? IFS=$SEP read -r _ _ recent <<< "$res" [ -n "$recent" ] && { - grouptime[$1]="$recent" + searchtime[$1]="$recent" echo -E "$res" } return 0 } +runsearch() { # [database] + local db="${1:-db.txt}" + local tmp=`mktemp` + touch "$db" + for q in "${!searchquery[@]}"; do + search "${searchquery[$q]}" \ + | while IFS=$SEP read -r title torrent time; do + echo -E "$time$SEP$q$SEP$title$SEP$torrent" + done + done | sort -n -- "$db" - | uniq > $tmp + # TODO: don't accidentally overwrite $db with something blank/incomplete + # maybe check if filesize has decreased and die if so + mv $tmp "$db" +} + runall() { trap cleanup INT local ret=0 gs= - for gs in "${!groupregex[@]}"; do rungroup "$gs" || ret=1; done + for gs in "${!searchregex[@]}"; do runfilter "$gs" || ret=1; done cleanup $ret } diff --git a/run b/run index 633be0e..e78cb3b 100755 --- a/run +++ b/run @@ -36,6 +36,11 @@ runactions() { . "$SRCDIR/meow.sh" . config.sh -[ -e times.sh ] && { . times.sh; mv times.sh times.sh.old; } -runall | runactions -exit ${PIPESTATUS[0]} + +#[ -e times.sh ] && { . times.sh; mv times.sh times.sh.old; } +#runall | runactions +#exit ${PIPESTATUS[0]} + +runsearch + +exit 0 diff --git a/scrape.awk b/scrape.awk index 0a1e78e..068c6af 100644 --- a/scrape.awk +++ b/scrape.awk @@ -22,16 +22,13 @@ BEGIN{ # http://stackoverflow.com/a/2123002 m=split("Jan|Feb|Mar|Apr|May|Jun|Jul|Aug|Sep|Oct|Nov|Dec",d,"|") for(o=1;o<=m;o++) months[d[o]]=sprintf("%02d",o) - glen=length(g) } function run(str) { title=unescape(opentag(str, "title")) - if (substr(title,1,glen+2) != "["g"]") return - pub=unescape(opentag($0, "pubDate")) + pub=unescape(opentag(str, "pubDate")) # "date -d \""pub "\" +%s" | getline pubunix pubunix=hotdate(pub) - if (pubunix <= ts) return torrent=unescape(opentag(str, "link")) print title sep torrent sep pubunix }