Update the CSV helper classes: add an encoding parameter and detect a UTF BOM when reading

This commit is contained in:
Clemens Schwaighofer
2025-12-16 18:53:16 +09:00
parent 11a75d8532
commit cf575ded90

View File

@@ -7,10 +7,13 @@ from typing import Any, Sequence
from pathlib import Path
from collections import Counter
import csv
from corelibs.file_handling.file_bom_encoding import is_bom_encoded, is_bom_encoded_info
from corelibs.exceptions.csv_exceptions import (
NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing
)
ENCODING = 'utf-8'
ENCODING_UTF8_SIG = 'utf-8-sig'
DELIMITER = ","
QUOTECHAR = '"'
# type: _QuotingType
@@ -27,6 +30,7 @@ class CsvWriter:
file_name: Path,
header_mapping: dict[str, str],
header_order: list[str] | None = None,
encoding: str = ENCODING,
delimiter: str = DELIMITER,
quotechar: str = QUOTECHAR,
quoting: Any = QUOTING,
@@ -38,6 +42,7 @@ class CsvWriter:
self.__delimiter = delimiter
self.__quotechar = quotechar
self.__quoting = quoting
self.__encoding = encoding
self.csv_file_writer = self.__open_csv(header_order)
def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]:
@@ -69,7 +74,8 @@ class CsvWriter:
try:
fp = open(
self.__file_name,
"w", encoding="utf-8"
"w",
encoding=self.__encoding
)
csv_file_writer = csv.DictWriter(
fp,
@@ -109,6 +115,7 @@ class CsvReader:
self,
file_name: Path,
header_check: Sequence[str] | None = None,
encoding: str = ENCODING,
delimiter: str = DELIMITER,
quotechar: str = QUOTECHAR,
quoting: Any = QUOTING,
@@ -118,6 +125,7 @@ class CsvReader:
self.__delimiter = delimiter
self.__quotechar = quotechar
self.__quoting = quoting
self.__encoding = encoding
self.header: Sequence[str] | None = None
self.csv_file_reader = self.__open_csv()
@@ -129,9 +137,16 @@ class CsvReader:
csv.DictReader | None: _description_
"""
try:
# for UTF-family encodings, check whether the file is BOM encoded and adjust the encoding to match
if self.__encoding.lower().startswith('utf-') and is_bom_encoded(self.__file_name):
bom_info = is_bom_encoded_info(self.__file_name)
if bom_info['encoding'] == 'utf-8':
self.__encoding = ENCODING_UTF8_SIG
else:
self.__encoding = bom_info['encoding'] or self.__encoding
fp = open(
self.__file_name,
"r", encoding="utf-8"
"r", encoding=self.__encoding
)
csv_file_reader = csv.DictReader(
fp,