Update the CSV helper class with UTF BOM detection when reading files

This commit is contained in:
Clemens Schwaighofer
2025-12-16 18:53:16 +09:00
parent 11a75d8532
commit cf575ded90

View File

@@ -7,10 +7,13 @@ from typing import Any, Sequence
from pathlib import Path from pathlib import Path
from collections import Counter from collections import Counter
import csv import csv
from corelibs.file_handling.file_bom_encoding import is_bom_encoded, is_bom_encoded_info
from corelibs.exceptions.csv_exceptions import ( from corelibs.exceptions.csv_exceptions import (
NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing
) )
ENCODING = 'utf-8'
ENCODING_UTF8_SIG = 'utf-8-sig'
DELIMITER = "," DELIMITER = ","
QUOTECHAR = '"' QUOTECHAR = '"'
# type: _QuotingType # type: _QuotingType
@@ -27,6 +30,7 @@ class CsvWriter:
file_name: Path, file_name: Path,
header_mapping: dict[str, str], header_mapping: dict[str, str],
header_order: list[str] | None = None, header_order: list[str] | None = None,
encoding: str = ENCODING,
delimiter: str = DELIMITER, delimiter: str = DELIMITER,
quotechar: str = QUOTECHAR, quotechar: str = QUOTECHAR,
quoting: Any = QUOTING, quoting: Any = QUOTING,
@@ -38,6 +42,7 @@ class CsvWriter:
self.__delimiter = delimiter self.__delimiter = delimiter
self.__quotechar = quotechar self.__quotechar = quotechar
self.__quoting = quoting self.__quoting = quoting
self.__encoding = encoding
self.csv_file_writer = self.__open_csv(header_order) self.csv_file_writer = self.__open_csv(header_order)
def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]: def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]:
@@ -69,7 +74,8 @@ class CsvWriter:
try: try:
fp = open( fp = open(
self.__file_name, self.__file_name,
"w", encoding="utf-8" "w",
encoding=self.__encoding
) )
csv_file_writer = csv.DictWriter( csv_file_writer = csv.DictWriter(
fp, fp,
@@ -109,6 +115,7 @@ class CsvReader:
self, self,
file_name: Path, file_name: Path,
header_check: Sequence[str] | None = None, header_check: Sequence[str] | None = None,
encoding: str = ENCODING,
delimiter: str = DELIMITER, delimiter: str = DELIMITER,
quotechar: str = QUOTECHAR, quotechar: str = QUOTECHAR,
quoting: Any = QUOTING, quoting: Any = QUOTING,
@@ -118,6 +125,7 @@ class CsvReader:
self.__delimiter = delimiter self.__delimiter = delimiter
self.__quotechar = quotechar self.__quotechar = quotechar
self.__quoting = quoting self.__quoting = quoting
self.__encoding = encoding
self.header: Sequence[str] | None = None self.header: Sequence[str] | None = None
self.csv_file_reader = self.__open_csv() self.csv_file_reader = self.__open_csv()
@@ -129,9 +137,16 @@ class CsvReader:
csv.DictReader | None: _description_ csv.DictReader | None: _description_
""" """
try: try:
# for UTF-style encodings, check whether the file is BOM encoded
if self.__encoding.lower().startswith('utf-') and is_bom_encoded(self.__file_name):
bom_info = is_bom_encoded_info(self.__file_name)
if bom_info['encoding'] == 'utf-8':
self.__encoding = ENCODING_UTF8_SIG
else:
self.__encoding = bom_info['encoding'] or self.__encoding
fp = open( fp = open(
self.__file_name, self.__file_name,
"r", encoding="utf-8" "r", encoding=self.__encoding
) )
csv_file_reader = csv.DictReader( csv_file_reader = csv.DictReader(
fp, fp,