Various data updates

2024-07-12 23:35:14 +09:00
parent 4ad609c759
commit e783b3f6b1
18 changed files with 201 additions and 1441 deletions

bin/pgnmentor_download.py (new file, 54 lines added)

@@ -0,0 +1,54 @@
#!/usr/bin/env python3
"""
Author: Clemens Schwaighofer
Date: 2023/9/6
Description: download pgn files from pgnmentor.com
"""
# MARK:TOP
import requests
import configparser
class Config:
    """
    folder locations and settings
    """
    def __init__(self):
        self.base_folder: str = ""
        self.temp: str = "temp"
        self.data: str = "data"
        self.download: str = "download"
        self.last_download: str = "last_download.txt"
        self.last_update: str = "last_update.txt"
        self.url_base_file: str = "https://www.pgnmentor.com/files.html"
        self.download_target_file: str = "[download][current_date]/master/files.html"
        self.download_target_pgn: str = "[download][current_date]/pgn/"


class Init:
    """
    init on run
    - download file to temp
    - check update diff
    - run download and diff flow
    """
    def __init__(self, config: Config):
        self.conf = config


def main():
    conf = Config()
    print(f"BASE: {conf.base_folder}")


if __name__ == "__main__":
    main()

# __END__
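
Note: pgnmentor_download.py is still a stub in this commit; Config and Init only hold settings, and main() just prints the base folder. As a rough illustration of the flow the docstrings describe (fetch files.html, compare its "updated: " lines with the stored last download, decide whether to fetch again), here is a minimal sketch using requests. The function name needs_download, the LAST_DOWNLOAD path, and the timeout value are assumptions, not part of this commit.

#!/usr/bin/env python3
"""Hypothetical sketch: fetch files.html and compare its "updated: " lines
with the stored ones to decide whether a new download is needed."""
from pathlib import Path

import requests

URL_BASE_FILE = "https://www.pgnmentor.com/files.html"
LAST_DOWNLOAD = Path("data/last_download.txt")  # assumed location


def needs_download() -> bool:
    """Return True when files.html reports a newer update than last time."""
    html = requests.get(URL_BASE_FILE, timeout=30).text
    # keep only the "updated: " marker lines, as the shell script greps them
    current = "\n".join(line for line in html.splitlines() if "updated: " in line)
    if not LAST_DOWNLOAD.is_file():
        return True
    return current != LAST_DOWNLOAD.read_text().rstrip("\n")


if __name__ == "__main__":
    print("download needed" if needs_download() else "data still current")

The shell script in the next hunk implements the same check with curl, grep and diff.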


@@ -22,12 +22,109 @@ function error() {
if [ -t 1 ]; then echo "$1"; fi; exit 1;
}
BASE_FOLDER=$(dirname "$(readlink -f "$0")")"/";
TEMP="${BASE_FOLDER}../temp/";
DATA="${BASE_FOLDER}../data/";
DOWNLOAD="${BASE_FOLDER}../download/";
last_download="${DATA}last_download.txt";
last_update="${DATA}last_update.txt";
url_base_file="https://www.pgnmentor.com/files.html";
# has to be YYYYMMDD
current_date="${1}";
# temp download
temp_files_dl="${TEMP}files.html";
# set the download targets for file
download_target_file="${DOWNLOAD}${current_date}/master/files.html";
download_target_pgn="${DOWNLOAD}${current_date}/pgn/";
# flag if we do anything
do_download=0;
# must have curl installed (or wget, but I am too lazy to code both)
type curl >/dev/null 2>&1 || error "curl not installed";
# date should be at least a bit valid
if [ -z "${current_date}" ] || ! [[ "${current_date}" =~ ^[0-9]{8}$ ]]; then
error "Current date must be set and in the format YYYYMMDD";
fi;
# 1) last_download does not exist -> download
# 2) download files.html to tmp
# 3) if one true -> compare
# 4) if different or 1) false -> download
check_dl=0;
if [ ! -f "${last_download}" ]; then
check_dl=1;
else
curl -o "${temp_files_dl}" "${url_base_file}";
grep "updated: " "${temp_files_dl}" > "${TEMP}cur_file_updated.txt";
file_diff=$(diff "${last_download}" "${TEMP}cur_file_updated.txt");
if [ -n "${file_diff}" ]; then
check_dl=1;
fi;
fi;
if [ $check_dl = 0 ]; then
error "Last downloaded data not outdated";
fi;
# normal download
# does that download folder exist
if [ ! -d "${DOWNLOAD}${current_date}" ]; then
echo "Create new folders for ${current_date}";
# build the basic folders
mkdir -p "${DOWNLOAD}${current_date}/master";
mkdir -p "${DOWNLOAD}${current_date}/pgn";
mkdir -p "${DATA}${current_date}";
# shift: current to last
last_sym=$(readlink -f "${DATA}current");
echo "$last_sym => ${DATA}last";
rm -f "${DATA}last";
ln -sf "${last_sym}" "${DATA}last";
# shift: this to new current
rm -f "${DATA}current";
ln -sf "${DATA}${current_date}" "${DATA}current";
fi;
# download master file
if [ ! -f "${download_target_file}" ]; then
echo "Download master file for ${current_date}";
curl -o "${download_target_file}" "${url_base_file}";
fi;
# diff current to last in download
# grep diffs and download to correct folders
# attach to main lists
# diff -u old new |grep "+" |grep "a href"
# parse level one:
# get the last updated date and see if it is different to the last one
# if [ ! -f "${last_download}" ]; then
# do_download=1;
# # check master file last update date
# grep "updated: " "${download_target_file}" > "${last_download}";
# echo "DL: ${dl_latest}";
# else
# # now we need to check
# echo "check last download is old enough";
# last_data=$(cat "${last_update}");
# echo "L: ${last_data}";
# # eg
# fi;
# if [ $do_download -eq 0 ]; then
# error "No updated data found";
# fi;
# start main processing
# TODO: limited download
# load previous set, so we do not download the same data twice
# all files in this list will not be downloaded again
# CURRENT:
# use DownThemAll to get all the data
# extract data files
# check they exist in pgn folder -> if not
# __END__
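
Note: the actual link-diff and download step is still only outlined in the comments above (diff the old and new files.html, grep the added "a href" lines, fetch those files). A hypothetical Python sketch of that step follows; extract_links, download_new, BASE_URL and the .pgn/.zip filter are assumptions, not code from this commit.

#!/usr/bin/env python3
"""Hypothetical sketch of the TODO above: diff the previous and the freshly
downloaded files.html and fetch only the pgn/zip links that are new."""
from pathlib import Path

import requests

BASE_URL = "https://www.pgnmentor.com/"  # links on files.html are relative


def extract_links(html: str) -> set[str]:
    """Crude href="..." extraction; enough for the static pgnmentor page."""
    return {
        part.split('"')[1]
        for line in html.splitlines()
        for part in line.split("href=")[1:]
        if part.startswith('"') and '"' in part[1:]
    }


def download_new(old_html: Path, new_html: Path, target_dir: Path) -> None:
    """Download every link that appears in new_html but not in old_html."""
    new_links = extract_links(new_html.read_text()) - extract_links(old_html.read_text())
    target_dir.mkdir(parents=True, exist_ok=True)
    for link in sorted(new_links):
        if not link.endswith((".pgn", ".zip")):
            continue  # only the game archives are of interest
        out = target_dir / Path(link).name
        print(f"fetch {link} -> {out}")
        out.write_bytes(requests.get(BASE_URL + link, timeout=60).content)

A real implementation would also have to refresh last_download.txt and place the files under the download/<date>/pgn/ layout the script above creates.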


@@ -31,20 +31,20 @@ if [ ! -d "${folder}" ]; then
exit;
fi;
-for i in ${files[*]}; do
-file=$(echo $i | sed -e "s/_/ /g")"."${current_date}".txt";
-output=$(echo $i | sed -e "s/_/ /g")".pgn";
+for i in "${files[@]}"; do
+file=${i//_/ }".${current_date}.txt";
+output=${i//_/ }".pgn";
if [ -f "${file}" ]; then
echo "OK: $file";
rm -f "${output}";
-for pgn in $(cat "${file}"); do
+while read -r pgn; do
if [ -f "${folder}${pgn}" ]; then
cat "${folder}${pgn}" >> "${output}";
echo -n "."
else
echo "[!!] Missing ${folder}${pgn}";
fi;
-done;
+done <"${file}";
echo "[DONE]";
fi;
done;