131 lines
3.6 KiB
Bash
Executable File
131 lines
3.6 KiB
Bash
Executable File
#!/usr/bin/env bash
|
|
|
|
# download/get files.html
|
|
# find (Events|Players|Openings updated): Month YYYY
|
|
# => store in last update
|
|
# => if one changed -> process
|
|
|
|
# in files.html
|
|
# div "Players"
|
|
# => table below
|
|
# => folder "players/"
|
|
|
|
# div "Openings"
|
|
# h2 "Modern Queen Pawn", "Classical Queen Pawn", "Modern King Pawn", "Classical King Pawn", "Flank and Unorthodox"
|
|
# => table below each h2
|
|
# => folder "openings/"
|
|
|
|
# div "Events"
|
|
# h2 "Tournaments", "Candidates and Interzonals", "World Championships"
|
|
|
|
# error MESSAGE
# Print MESSAGE to stderr and abort the script with exit status 1.
# (Previously the message went to stdout and only when stdout was a
# terminal, so failures under cron/redirection were silently swallowed.)
function error() {
	printf '%s\n' "$1" >&2;
	exit 1;
}
|
|
|
|
# Resolve the directory this script lives in (symlinks resolved) and
# derive the project folder layout relative to it.
BASE_FOLDER="$(dirname "$(readlink -f "$0")")/"
TEMP="${BASE_FOLDER}../temp/"
DATA="${BASE_FOLDER}../data/"
DOWNLOAD="${BASE_FOLDER}../download/"

# marker files remembering what the previous run saw
last_download="${DATA}last_download.txt"
last_update="${DATA}last_update.txt"

# upstream page that lists all downloadable PGN archives
url_base_file="https://www.pgnmentor.com/files.html"

# first CLI argument; has to be YYYYMMDD
current_date="${1}"

# scratch location for the freshly fetched files.html
temp_files_dl="${TEMP}files.html"

# download targets for this run: the master page and the pgn archives
download_target_file="${DOWNLOAD}${current_date}/master/files.html"
download_target_pgn="${DOWNLOAD}${current_date}/pgn/"

# flag if we do anything
do_download=0
|
|
|
|
# must have curl installed (or wget but I am lazy to code in both);
# 'command -v' is the portable, recommended existence check
command -v curl >/dev/null 2>&1 || error "curl not installed";

# date must be exactly eight digits (YYYYMMDD) -- it is interpolated
# into folder names below, so reject anything else up front
# (the regex already rejects the empty string, no separate -z test needed)
if ! [[ "${current_date}" =~ ^[0-9]{8}$ ]]; then
	error "Current date must be set and in the format YYYYMMDD";
fi;
|
|
|
|
# Decide whether new data must be downloaded:
# 1) last_download does not exist -> download
# 2) otherwise fetch the current files.html to temp
# 3) extract its "updated: " lines and compare with the stored ones
# 4) if different -> download
check_dl=0;
if [ ! -f "${last_download}" ]; then
	check_dl=1;
else
	# -f: fail on HTTP errors; -sS: quiet, but still report real errors
	curl -fsS -o "${temp_files_dl}" "${url_base_file}" || error "download of ${url_base_file} failed";
	# compare against the file just downloaded to temp, NOT the copy in
	# the dated download folder (which may not exist yet at this point)
	grep "updated: " "${temp_files_dl}" > "${TEMP}cur_file_updated.txt";
	# diff exit status is enough; no need to capture its output
	if ! diff -q "${last_download}" "${TEMP}cur_file_updated.txt" >/dev/null; then
		check_dl=1;
	fi;
fi;

# upstream has not changed since the last run -> nothing to do
if [ "${check_dl}" -eq 0 ]; then
	error "Last downloaded data not outdated";
fi;
|
|
|
|
# normal download

# First run for this date: create the folder layout and rotate the
# data/current -> data/last symlinks.
if [ ! -d "${DOWNLOAD}${current_date}" ]; then
	echo "Create new folders for ${current_date}";
	# build basic folders
	mkdir -p "${DOWNLOAD}${current_date}/master";
	mkdir -p "${DOWNLOAD}${current_date}/pgn";
	mkdir -p "${DATA}${current_date}";
	# shift: current to last (resolve where "current" points first)
	last_sym=$(readlink -f "${DATA}current");
	echo "${last_sym} => ${DATA}last";
	# -f so a missing link (e.g. the very first run) is not an error
	rm -f "${DATA}last";
	# -n: replace the symlink itself instead of creating a new link
	# inside the directory it points to
	ln -sfn "${last_sym}" "${DATA}last";
	# shift: this run becomes the new current
	rm -f "${DATA}current";
	ln -sfn "${DATA}${current_date}" "${DATA}current";
fi;
|
|
|
|
# Fetch the master files.html into the dated download folder unless a
# copy already exists (keeps the script re-runnable for the same date).
if [ ! -f "${download_target_file}" ]; then
	echo "Download master file for ${current_date}";
	# -f: fail on HTTP errors so an error page is never stored as data,
	# which would otherwise block all future re-runs for this date
	curl -fsS -o "${download_target_file}" "${url_base_file}" || error "download of ${url_base_file} failed";
fi;
|
|
|
|
# diff current to last in download
|
|
# grep diffs and download to correct folders
|
|
# attach to main lists
|
|
|
|
# diff -u old new |grep "+" |grep "a href"
|
|
|
|
# parse level one:
|
|
# get the last updated date and see if it is different to the last one
|
|
# if [ ! -f "${last_download}" ]; then
|
|
# do_download=1;
|
|
# # check master file last update date
|
|
# grep "updated: " "${download_target_file}" > "${last_download}";
|
|
# echo "DL: ${dl_latest}";
|
|
# else
|
|
# # now we need to check
|
|
# echo "check last download is old enough";
|
|
# last_data=$(cat "${last_update}");
|
|
# echo "L: ${last_data}";
|
|
# # eg
|
|
# fi;
|
|
|
|
# if [ $do_download -eq 0 ]; then
|
|
# error "No updated data found";
|
|
# fi;
|
|
|
|
# start main processing
|
|
|
|
# TODO: limited download
|
|
# load previous set, so we do not download the same data twice
|
|
# all files in this files will not be downloaded
|
|
|
|
# CURRENT:
|
|
# use DownThemAll to get all the data
|
|
# extract data files
|
|
# check they exist in pgn folder -> if not
|
|
|
|
# __END__
|