Update the CSV helper classes: add a configurable encoding and detect a UTF BOM when reading
This commit is contained in:
@@ -7,10 +7,13 @@ from typing import Any, Sequence
|
|||||||
from pathlib import Path
|
from pathlib import Path
|
||||||
from collections import Counter
|
from collections import Counter
|
||||||
import csv
|
import csv
|
||||||
|
from corelibs.file_handling.file_bom_encoding import is_bom_encoded, is_bom_encoded_info
|
||||||
from corelibs.exceptions.csv_exceptions import (
|
from corelibs.exceptions.csv_exceptions import (
|
||||||
NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing
|
NoCsvReader, CompulsoryCsvHeaderCheckFailed, CsvHeaderDataMissing
|
||||||
)
|
)
|
||||||
|
|
||||||
|
ENCODING = 'utf-8'
|
||||||
|
ENCODING_UTF8_SIG = 'utf-8-sig'
|
||||||
DELIMITER = ","
|
DELIMITER = ","
|
||||||
QUOTECHAR = '"'
|
QUOTECHAR = '"'
|
||||||
# type: _QuotingType
|
# type: _QuotingType
|
||||||
@@ -27,6 +30,7 @@ class CsvWriter:
|
|||||||
file_name: Path,
|
file_name: Path,
|
||||||
header_mapping: dict[str, str],
|
header_mapping: dict[str, str],
|
||||||
header_order: list[str] | None = None,
|
header_order: list[str] | None = None,
|
||||||
|
encoding: str = ENCODING,
|
||||||
delimiter: str = DELIMITER,
|
delimiter: str = DELIMITER,
|
||||||
quotechar: str = QUOTECHAR,
|
quotechar: str = QUOTECHAR,
|
||||||
quoting: Any = QUOTING,
|
quoting: Any = QUOTING,
|
||||||
@@ -38,6 +42,7 @@ class CsvWriter:
|
|||||||
self.__delimiter = delimiter
|
self.__delimiter = delimiter
|
||||||
self.__quotechar = quotechar
|
self.__quotechar = quotechar
|
||||||
self.__quoting = quoting
|
self.__quoting = quoting
|
||||||
|
self.__encoding = encoding
|
||||||
self.csv_file_writer = self.__open_csv(header_order)
|
self.csv_file_writer = self.__open_csv(header_order)
|
||||||
|
|
||||||
def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]:
|
def __open_csv(self, header_order: list[str] | None) -> csv.DictWriter[str]:
|
||||||
@@ -69,7 +74,8 @@ class CsvWriter:
|
|||||||
try:
|
try:
|
||||||
fp = open(
|
fp = open(
|
||||||
self.__file_name,
|
self.__file_name,
|
||||||
"w", encoding="utf-8"
|
"w",
|
||||||
|
encoding=self.__encoding
|
||||||
)
|
)
|
||||||
csv_file_writer = csv.DictWriter(
|
csv_file_writer = csv.DictWriter(
|
||||||
fp,
|
fp,
|
||||||
@@ -109,6 +115,7 @@ class CsvReader:
|
|||||||
self,
|
self,
|
||||||
file_name: Path,
|
file_name: Path,
|
||||||
header_check: Sequence[str] | None = None,
|
header_check: Sequence[str] | None = None,
|
||||||
|
encoding: str = ENCODING,
|
||||||
delimiter: str = DELIMITER,
|
delimiter: str = DELIMITER,
|
||||||
quotechar: str = QUOTECHAR,
|
quotechar: str = QUOTECHAR,
|
||||||
quoting: Any = QUOTING,
|
quoting: Any = QUOTING,
|
||||||
@@ -118,6 +125,7 @@ class CsvReader:
|
|||||||
self.__delimiter = delimiter
|
self.__delimiter = delimiter
|
||||||
self.__quotechar = quotechar
|
self.__quotechar = quotechar
|
||||||
self.__quoting = quoting
|
self.__quoting = quoting
|
||||||
|
self.__encoding = encoding
|
||||||
self.header: Sequence[str] | None = None
|
self.header: Sequence[str] | None = None
|
||||||
self.csv_file_reader = self.__open_csv()
|
self.csv_file_reader = self.__open_csv()
|
||||||
|
|
||||||
@@ -129,9 +137,16 @@ class CsvReader:
|
|||||||
csv.DictReader | None: _description_
|
csv.DictReader | None: _description_
|
||||||
"""
|
"""
|
||||||
try:
|
try:
|
||||||
|
# if a UTF-style encoding was requested, check whether the file starts with a BOM
|
||||||
|
if self.__encoding.lower().startswith('utf-') and is_bom_encoded(self.__file_name):
|
||||||
|
bom_info = is_bom_encoded_info(self.__file_name)
|
||||||
|
if bom_info['encoding'] == 'utf-8':
|
||||||
|
self.__encoding = ENCODING_UTF8_SIG
|
||||||
|
else:
|
||||||
|
self.__encoding = bom_info['encoding'] or self.__encoding
|
||||||
fp = open(
|
fp = open(
|
||||||
self.__file_name,
|
self.__file_name,
|
||||||
"r", encoding="utf-8"
|
"r", encoding=self.__encoding
|
||||||
)
|
)
|
||||||
csv_file_reader = csv.DictReader(
|
csv_file_reader = csv.DictReader(
|
||||||
fp,
|
fp,
|
||||||
|
|||||||
Reference in New Issue
Block a user