#!/usr/bin/env bash

# download/get files.html
# find "(Events|Players|Openings) updated: Month YYYY"
# => store in last update
# => if one changed -> process
# in files.html
# div "Players"
#   => table below
#   => folder "players/"
# div "Openings"
#   h2 "Modern Queen Pawn", "Classical Queen Pawn", "Modern King Pawn", "Classical King Pawn", "Flank and Unorthodox"
#   => table below each h2
#   => folder "openings/"
# div "Events"
#   h2 "Tournaments", "Candidates and Interzonals", "World Championships"

# print the message (only if stdout is a terminal) and abort
function error() { if [ -t 1 ]; then echo "$1"; fi; exit 1; }

BASE_FOLDER=$(dirname "$(readlink -f "$0")")"/";
TEMP="${BASE_FOLDER}../temp/";
DATA="${BASE_FOLDER}../data/";
DOWNLOAD="${BASE_FOLDER}../download/";
last_download="${DATA}last_download.txt";
last_update="${DATA}last_update.txt";
url_base_file="https://www.pgnmentor.com/files.html";
# has to be YYYYMMDD
current_date="${1}";
# temp download
temp_files_dl="${TEMP}files.html";
# set the download targets for this run
download_target_file="${DOWNLOAD}${current_date}/master/files.html";
download_target_pgn="${DOWNLOAD}${current_date}/pgn/";
# flag if we do anything
do_download=0;

# must have curl installed (or wget, but I am too lazy to code for both)
type curl >/dev/null 2>&1 || error "curl not installed";

# date should be at least somewhat valid
if [ -z "${current_date}" ] || ! [[ "${current_date}" =~ ^[0-9]{8}$ ]]; then
    error "Current date must be set and in the format YYYYMMDD";
fi;

# 1) last_download does not exist -> download
# 2) otherwise download files.html to temp
# 3) compare its "updated: " lines against last_download
# 4) if different (or last_download missing) -> download
check_dl=0;
if [ ! -f "${last_download}" ]; then
    check_dl=1;
else
    curl -o "${temp_files_dl}" "${url_base_file}";
    # compare the "updated: " lines of the fresh temp copy against the last run
    grep "updated: " "${temp_files_dl}" > "${TEMP}cur_file_updated.txt";
    file_diff=$(diff "${last_download}" "${TEMP}cur_file_updated.txt");
    if [ -n "${file_diff}" ]; then
        check_dl=1;
    fi;
fi;
if [ "${check_dl}" = 0 ]; then
    error "Last downloaded data is not outdated";
fi;

# normal download
# does that download folder exist yet?
if [ ! -d "${DOWNLOAD}${current_date}" ]; then
    echo "Create new folders for ${current_date}";
    # build the basic folders
    mkdir -p "${DOWNLOAD}${current_date}/master";
    mkdir -p "${DOWNLOAD}${current_date}/pgn";
    mkdir -p "${DATA}${current_date}";
    # shift: current to last
    last_sym=$(readlink -f "${DATA}current");
    echo "$last_sym => ${DATA}last";
    rm -f "${DATA}last";
    ln -sf "${last_sym}" "${DATA}last";
    # shift: this to the new current
    rm -f "${DATA}current";
    ln -sf "${DATA}${current_date}" "${DATA}current";
fi;

# download master file
if [ ! -f "${download_target_file}" ]; then
    echo "Download master file for ${current_date}";
    curl -o "${download_target_file}" "${url_base_file}";
fi;

# diff current to last in download
# grep diffs and download to correct folders
# attach to main lists
# diff -u old new |grep "+" |grep "a href"
# (see the sketch below)
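# --- Sketch (not part of the original flow, never invoked): one possible way to do the
# diff-and-fetch step outlined in the comments above. The function name is hypothetical;
# it assumes the zip links on files.html are relative paths like "players/Adams.zip" and
# takes the path of the previously downloaded files.html as its first argument.
function download_new_archives() {
    local old_master="${1}";
    local new_master="${download_target_file}";
    # keep only added lines, pull out the zip links, and fetch each one
    diff -u "${old_master}" "${new_master}" \
        | grep '^+' \
        | grep -o 'href="[^"]*\.zip"' \
        | sed 's/^href="//;s/"$//' \
        | sort -u \
        | while read -r link; do
            echo "Download new archive: ${link}";
            # mirror the remote sub folder (players/, openings/, events/) below the pgn target
            mkdir -p "${download_target_pgn}$(dirname "${link}")";
            curl -o "${download_target_pgn}${link}" "https://www.pgnmentor.com/${link}";
        done;
}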
-f "${last_download}" ]; then # do_download=1; # # check master file last update date # grep "updated: " "${download_target_file}" > "${last_download}"; # echo "DL: ${dl_latest}"; # else # # now we need to check # echo "check last download is old enough"; # last_data=$(cat "${last_update}"); # echo "L: ${last_data}"; # # eg # fi; # if [ $do_download -eq 0 ]; then # error "No updated data found"; # fi; # start main processing # TODO: limited download # load previous set, so we do not download the same data twice # all files in this files will not be downloaded # CURRENT: # use DownThemAll to get all the data # extract data files # check they exist in pgn folder -> if not # __END__