Various data updates
This commit is contained in:
54
bin/pgnmentor_download.py
Normal file
54
bin/pgnmentor_download.py
Normal file
@@ -0,0 +1,54 @@
|
||||
#!/usr/bin/env python3
|
||||
|
||||
"""
|
||||
Author: Clemens Schwaighofer
|
||||
Date: 2023/9/6
|
||||
Description: download pgn files from pgnmentor.com
|
||||
"""
|
||||
|
||||
# MARK:TOP
|
||||
|
||||
import requests
|
||||
import configparser
|
||||
|
||||
|
||||
class Config:
    """
    folder locations and settings

    All values are plain strings set to fixed defaults in __init__.
    The two download_target_* entries are path templates containing the
    bracketed markers "[download]" and "[current_date]".
    NOTE(review): nothing in this file replaces those markers yet --
    presumably the caller substitutes them; confirm once that code exists.
    """

    def __init__(self) -> None:
        # base folder of the installation, empty by default
        self.base_folder: str = ""
        # folder names
        self.temp: str = "temp"
        self.data: str = "data"
        self.download: str = "download"
        # state file names
        self.last_download: str = "last_download.txt"
        self.last_update: str = "last_update.txt"

        # index page that lists the downloadable files
        self.url_base_file: str = "https://www.pgnmentor.com/files.html"

        # download target templates; both must use the same
        # "[current_date]" marker and the "master" sub folder
        self.download_target_file: str = "[download][current_date]/master/files.html"
        self.download_target_pgn: str = "[download][current_date]/pgn/"
|
||||
|
||||
|
||||
class Init:
    """
    init on run

    Planned flow (only storing the config is implemented so far):
    - download file to temp
    - check update diff
    - run download and diff flow
    """

    def __init__(self, config: Config) -> None:
        # keep the settings object for the later steps
        self.conf = config
|
||||
|
||||
|
||||
def main() -> None:
    """
    script entry point

    Builds a Config with its default values and prints the base folder.
    """
    conf = Config()

    print(f"BASE: {conf.base_folder}")


# run only when executed as a script, so importing this module
# has no side effects
if __name__ == "__main__":
    main()
|
||||
|
||||
# __END__
|
||||
@@ -22,12 +22,109 @@ function error() {
|
||||
if [ -t 1 ]; then echo "$1"; fi; exit 1;
|
||||
}
|
||||
|
||||
# folder this script lives in, with trailing slash
BASE_FOLDER="$(dirname "$(readlink -f "$0")")/";
# working folders, all relative to the script folder
TEMP="${BASE_FOLDER}../temp/";
DATA="${BASE_FOLDER}../data/";
DOWNLOAD="${BASE_FOLDER}../download/";
# state files kept in the data folder
last_download="${DATA}last_download.txt";
last_update="${DATA}last_update.txt";
# index page to fetch
url_base_file="https://www.pgnmentor.com/files.html";
# first argument, has to be YYYYMMDD
current_date="$1";
# temp download
temp_files_dl="${TEMP}files.html";
# dated download targets for the index page and the pgn files
download_target_file="${DOWNLOAD}${current_date}/master/files.html";
download_target_pgn="${DOWNLOAD}${current_date}/pgn/";
# flag if we do anything
do_download=0;

# must have curl installed (or wget but I am lazy to code in both)
command -v curl >/dev/null 2>&1 || error "curl not installed";
# date should be at least a bit valid: exactly eight digits
# (an empty value does not match either)
[[ "${current_date}" =~ ^[0-9]{8}$ ]] || error "Current date must be set and in the format YYYYMMDD";
|
||||
|
||||
# decide if a new download is needed
# 1) last_download does not exist -> download
# 2) else: download files.html to the temp folder
# 3) extract its "updated: " line and compare it with last_download
# 4) if different -> download, if equal -> stop here
check_dl=0;
if [ ! -f "${last_download}" ]; then
    check_dl=1;
else
    # --fail: do not treat an HTTP error page as the current files.html
    curl --fail -o "${temp_files_dl}" "${url_base_file}" || error "Could not download ${url_base_file}";
    # read the "updated: " line from the file just fetched; the dated master
    # copy is only created further below and may not exist at this point
    grep "updated: " "${temp_files_dl}" > "${TEMP}cur_file_updated.txt";
    file_diff=$(diff "${last_download}" "${TEMP}cur_file_updated.txt");
    if [ -n "${file_diff}" ]; then
        check_dl=1;
    fi;
fi;

# nothing changed since the last download: end here
if [ "${check_dl}" -eq 0 ]; then
    error "Last downloaded data not outdated";
fi;
|
||||
|
||||
# normal download
|
||||
|
||||
# does that download folder exist
if [ ! -d "${DOWNLOAD}${current_date}" ]; then
    echo "Create new folders for ${current_date}";
    # build basic folders
    mkdir -p "${DOWNLOAD}${current_date}/master";
    mkdir -p "${DOWNLOAD}${current_date}/pgn";
    mkdir -p "${DATA}${current_date}";
    # shift: current to last
    # only when a "current" link exists; on a first run there is nothing to
    # shift and "last" must not be pointed at the missing "current" path
    # NOTE(review): assumes "current" and "last" are symlinks -- confirm
    if [ -L "${DATA}current" ]; then
        last_sym=$(readlink -f "${DATA}current");
        echo "$last_sym => ${DATA}last";
        # -f: no error if there is no "last" link yet
        rm -f "${DATA}last";
        ln -sf "${last_sym}" "${DATA}last";
    fi;
    # shift: this to new current
    rm -f "${DATA}current";
    ln -sf "${DATA}${current_date}" "${DATA}current";
fi;
|
||||
|
||||
# download master file
# only if it is not there yet for this date
if [ ! -f "${download_target_file}" ]; then
    echo "Download master file for ${current_date}";
    # --fail: an HTTP error page must not be stored as master file, the
    # -f test above would keep it from ever being fetched again
    curl --fail -o "${download_target_file}" "${url_base_file}" || error "Could not download ${url_base_file}";
fi;
|
||||
|
||||
# diff current to last in download
|
||||
# grep diffs and download to correct folders
|
||||
# attach to main lists
|
||||
|
||||
# diff -u old new |grep "+" |grep "a href"
|
||||
|
||||
# parse level one:
|
||||
# get the last updated date and see if it is different to the last one
|
||||
# if [ ! -f "${last_download}" ]; then
|
||||
# do_download=1;
|
||||
# # check master file last update date
|
||||
# grep "updated: " "${download_target_file}" > "${last_download}";
|
||||
# echo "DL: ${dl_latest}";
|
||||
# else
|
||||
# # now we need to check
|
||||
# echo "check last download is old enough";
|
||||
# last_data=$(cat "${last_update}");
|
||||
# echo "L: ${last_data}";
|
||||
# # eg
|
||||
# fi;
|
||||
|
||||
# if [ $do_download -eq 0 ]; then
|
||||
# error "No updated data found";
|
||||
# fi;
|
||||
|
||||
# start main processing
|
||||
|
||||
# TODO: limited download
|
||||
# load previous set, so we do not download the same data twice
|
||||
# all files listed in that set will not be downloaded
|
||||
|
||||
# CURRENT:
|
||||
# use DownThemAll to get all the data
|
||||
# extract data files
|
||||
# check they exist in pgn folder -> if not
|
||||
|
||||
# __END__
|
||||
|
||||
@@ -31,20 +31,20 @@ if [ ! -d "${folder}" ]; then
|
||||
exit;
|
||||
fi;
|
||||
|
||||
for i in ${files[*]}; do
|
||||
file=$(echo $i | sed -e "s/_/ /g")"."${current_date}".txt";
|
||||
output=$(echo $i | sed -e "s/_/ /g")".pgn";
|
||||
for i in "${files[@]}"; do
|
||||
file=${i//_/ }".${current_date}.txt";
|
||||
output=${i//_/ }".pgn";
|
||||
if [ -f "${file}" ]; then
|
||||
echo "OK: $file";
|
||||
rm -f "${output}";
|
||||
for pgn in $(cat "${file}"); do
|
||||
while read -r pgn; do
|
||||
if [ -f "${folder}${pgn}" ]; then
|
||||
cat "${folder}${pgn}" >> "${output}";
|
||||
echo -n "."
|
||||
else
|
||||
echo "[!!] Missing ${folder}${pgn}";
|
||||
fi;
|
||||
done;
|
||||
done <"${file}";
|
||||
echo "[DONE]";
|
||||
fi;
|
||||
done;
|
||||
|
||||
Reference in New Issue
Block a user