From cf575ded909fe2fedd7ee8d0685f6a396b5728cf Mon Sep 17 00:00:00 2001 From: Clemens Schwaighofer Date: Tue, 16 Dec 2025 18:53:16 +0900 Subject: [PATCH] Update on the CSV helper class with UTF detection for BOM reading --- src/corelibs/csv_handling/csv_interface.py | 19 +++++++++++++++++-- 1 file changed, 17 insertions(+), 2 deletions(-) diff --git a/src/corelibs/csv_handling/csv_interface.py b/src/corelibs/csv_handling/csv_interface.py index 1571ae8..4a51d44 100644 --- a/src/corelibs/csv_handling/csv_interface.py +++ b/src/corelibs/csv_handling/csv_interface.py @@ -7,10 +7,13 @@ from typing import Any, Sequence from pathlib import Path from collections import Counter import csv +from corelibs.file_handling.file_bom_encoding import is_bom_encoded, is_bom_encoded_info from corelibs.exceptions.csv_exceptions import ( NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing ) +ENCODING = 'utf-8' +ENCODING_UTF8_SIG = 'utf-8-sig' DELIMITER = "," QUOTECHAR = '"' # type: _QuotingType @@ -27,6 +30,7 @@ class CsvWriter: file_name: Path, header_mapping: dict[str, str], header_order: list[str] | None = None, + encoding: str = ENCODING, delimiter: str = DELIMITER, quotechar: str = QUOTECHAR, quoting: Any = QUOTING, @@ -38,6 +42,7 @@ class CsvWriter: self.__delimiter = delimiter self.__quotechar = quotechar self.__quoting = quoting + self.__encoding = encoding self.csv_file_writer = self.__open_csv(header_order) def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]: @@ -69,7 +74,8 @@ class CsvWriter: try: fp = open( self.__file_name, - "w", encoding="utf-8" + "w", + encoding=self.__encoding ) csv_file_writer = csv.DictWriter( fp, @@ -109,6 +115,7 @@ class CsvReader: self, file_name: Path, header_check: Sequence[str] | None = None, + encoding: str = ENCODING, delimiter: str = DELIMITER, quotechar: str = QUOTECHAR, quoting: Any = QUOTING, @@ -118,6 +125,7 @@ class CsvReader: self.__delimiter = delimiter self.__quotechar = quotechar self.__quoting = quoting + self.__encoding = encoding self.header: Sequence[str] | None = None self.csv_file_reader = self.__open_csv() @@ -129,9 +137,16 @@ class CsvReader: csv.DictReader | None: _description_ """ try: + # if UTF style check if this is BOM + if self.__encoding.lower().startswith('utf-') and is_bom_encoded(self.__file_name): + bom_info = is_bom_encoded_info(self.__file_name) + if bom_info['encoding'] == 'utf-8': + self.__encoding = ENCODING_UTF8_SIG + else: + self.__encoding = bom_info['encoding'] or self.__encoding fp = open( self.__file_name, - "r", encoding="utf-8" + "r", encoding=self.__encoding ) csv_file_reader = csv.DictReader( fp,