|
|
|
|
@@ -0,0 +1,538 @@
|
|
|
|
|
"""
|
|
|
|
|
PyTest: file_handling/file_bom_encoding
|
|
|
|
|
"""
|
|
|
|
|
|
|
|
|
|
from pathlib import Path
|
|
|
|
|
import pytest
|
|
|
|
|
|
|
|
|
|
from corelibs.file_handling.file_bom_encoding import (
|
|
|
|
|
is_bom_encoded,
|
|
|
|
|
is_bom_encoded_info,
|
|
|
|
|
BomEncodingInfo,
|
|
|
|
|
)
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestIsBomEncoded:
    """Tests for the boolean ``is_bom_encoded`` check.

    Every test writes raw bytes into a file below pytest's ``tmp_path`` and
    asserts on the value returned (or the error raised) for that file.
    """

    def test_utf8_bom_file(self, tmp_path: Path):
        """A file starting with the UTF-8 BOM (EF BB BF) yields the bool ``True``."""
        bom_file = tmp_path / "utf8_bom.txt"
        bom_file.write_bytes(b'\xef\xbb\xbfHello, World!')

        detected = is_bom_encoded(bom_file)

        assert detected is True
        assert isinstance(detected, bool)

    def test_utf16_le_bom_file(self, tmp_path: Path):
        """A file starting with the UTF-16 LE BOM (FF FE) is detected."""
        bom_file = tmp_path / "utf16_le_bom.txt"
        bom_file.write_bytes(b'\xff\xfeH\x00e\x00l\x00l\x00o\x00')

        assert is_bom_encoded(bom_file) is True

    def test_utf16_be_bom_file(self, tmp_path: Path):
        """A file starting with the UTF-16 BE BOM (FE FF) is detected."""
        bom_file = tmp_path / "utf16_be_bom.txt"
        bom_file.write_bytes(b'\xfe\xff\x00H\x00e\x00l\x00l\x00o')

        assert is_bom_encoded(bom_file) is True

    def test_utf32_le_bom_file(self, tmp_path: Path):
        """A file starting with the UTF-32 LE BOM (FF FE 00 00) is detected."""
        bom_file = tmp_path / "utf32_le_bom.txt"
        bom_file.write_bytes(b'\xff\xfe\x00\x00H\x00\x00\x00e\x00\x00\x00')

        assert is_bom_encoded(bom_file) is True

    def test_utf32_be_bom_file(self, tmp_path: Path):
        """A file starting with the UTF-32 BE BOM (00 00 FE FF) is detected."""
        bom_file = tmp_path / "utf32_be_bom.txt"
        bom_file.write_bytes(b'\x00\x00\xfe\xff\x00\x00\x00H\x00\x00\x00e')

        assert is_bom_encoded(bom_file) is True

    def test_no_bom_ascii_file(self, tmp_path: Path):
        """Plain ASCII content without a BOM yields ``False``."""
        plain_file = tmp_path / "ascii.txt"
        plain_file.write_bytes(b'Hello, World!')

        assert is_bom_encoded(plain_file) is False

    def test_no_bom_utf8_file(self, tmp_path: Path):
        """UTF-8 encoded non-ASCII text without a BOM yields ``False``."""
        plain_file = tmp_path / "utf8_no_bom.txt"
        plain_file.write_bytes('Hello, 世界!'.encode('utf-8'))

        assert is_bom_encoded(plain_file) is False

    def test_empty_file(self, tmp_path: Path):
        """A zero-byte file yields ``False``."""
        empty_file = tmp_path / "empty.txt"
        empty_file.write_bytes(b'')

        assert is_bom_encoded(empty_file) is False

    def test_binary_file_no_bom(self, tmp_path: Path):
        """Binary content (bytes 0..255 in order, so starting with 00 01) yields ``False``."""
        binary_file = tmp_path / "binary.bin"
        binary_file.write_bytes(bytes(range(256)))

        assert is_bom_encoded(binary_file) is False

    def test_partial_bom_pattern(self, tmp_path: Path):
        """Only the first two bytes of the UTF-8 BOM (EF BB) must not count as a BOM."""
        partial_file = tmp_path / "partial_bom.txt"
        partial_file.write_bytes(b'\xef\xbbHello')

        assert is_bom_encoded(partial_file) is False

    def test_false_positive_bom_pattern(self, tmp_path: Path):
        """BOM bytes that appear after the start of the file must be ignored."""
        decoy_file = tmp_path / "false_positive.txt"
        decoy_file.write_bytes(b'Hello\xef\xbb\xbfWorld')

        assert is_bom_encoded(decoy_file) is False

    def test_nonexistent_file(self, tmp_path: Path):
        """A path that was never created raises ``ValueError`` with the expected message."""
        missing = tmp_path / "nonexistent.txt"

        with pytest.raises(ValueError, match="Error checking BOM encoding"):
            is_bom_encoded(missing)

    def test_very_small_file(self, tmp_path: Path):
        """A two-byte file, shorter than the 4-byte UTF-32 BOMs, yields ``False``."""
        tiny_file = tmp_path / "small.txt"
        tiny_file.write_bytes(b'Hi')

        assert is_bom_encoded(tiny_file) is False

    def test_exactly_bom_size_utf8(self, tmp_path: Path):
        """A file consisting of nothing but the UTF-8 BOM is still detected."""
        bom_only = tmp_path / "exact_bom.txt"
        bom_only.write_bytes(b'\xef\xbb\xbf')

        assert is_bom_encoded(bom_only) is True

    def test_exactly_bom_size_utf32(self, tmp_path: Path):
        """A file consisting of nothing but the UTF-32 LE BOM is still detected."""
        bom_only = tmp_path / "exact_bom_utf32.txt"
        bom_only.write_bytes(b'\xff\xfe\x00\x00')

        assert is_bom_encoded(bom_only) is True
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestIsBomEncodedInfo:
    """Tests for the detailed ``is_bom_encoded_info`` result.

    The result is accessed by the keys ``has_bom``, ``bom_type``,
    ``encoding``, ``bom_length`` and ``bom_pattern``.
    """

    def test_utf8_bom_info(self, tmp_path: Path):
        """A UTF-8 BOM file yields a dict describing the 3-byte UTF-8 BOM."""
        bom_file = tmp_path / "utf8_bom.txt"
        bom_file.write_bytes(b'\xef\xbb\xbfHello, UTF-8!')

        info = is_bom_encoded_info(bom_file)

        assert isinstance(info, dict)
        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-8'
        assert info['encoding'] == 'utf-8'
        assert info['bom_length'] == 3
        assert info['bom_pattern'] == b'\xef\xbb\xbf'

    def test_utf16_le_bom_info(self, tmp_path: Path):
        """A UTF-16 LE BOM file is described with the 2-byte FF FE pattern."""
        bom_file = tmp_path / "utf16_le_bom.txt"
        bom_file.write_bytes(b'\xff\xfeH\x00e\x00l\x00l\x00o\x00')

        info = is_bom_encoded_info(bom_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-16 LE'
        assert info['encoding'] == 'utf-16-le'
        assert info['bom_length'] == 2
        assert info['bom_pattern'] == b'\xff\xfe'

    def test_utf16_be_bom_info(self, tmp_path: Path):
        """A UTF-16 BE BOM file is described with the 2-byte FE FF pattern."""
        bom_file = tmp_path / "utf16_be_bom.txt"
        bom_file.write_bytes(b'\xfe\xff\x00H\x00e\x00l\x00l\x00o')

        info = is_bom_encoded_info(bom_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-16 BE'
        assert info['encoding'] == 'utf-16-be'
        assert info['bom_length'] == 2
        assert info['bom_pattern'] == b'\xfe\xff'

    def test_utf32_le_bom_info(self, tmp_path: Path):
        """A UTF-32 LE BOM file is described with the 4-byte FF FE 00 00 pattern."""
        bom_file = tmp_path / "utf32_le_bom.txt"
        bom_file.write_bytes(b'\xff\xfe\x00\x00H\x00\x00\x00e\x00\x00\x00')

        info = is_bom_encoded_info(bom_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-32 LE'
        assert info['encoding'] == 'utf-32-le'
        assert info['bom_length'] == 4
        assert info['bom_pattern'] == b'\xff\xfe\x00\x00'

    def test_utf32_be_bom_info(self, tmp_path: Path):
        """A UTF-32 BE BOM file is described with the 4-byte 00 00 FE FF pattern."""
        bom_file = tmp_path / "utf32_be_bom.txt"
        bom_file.write_bytes(b'\x00\x00\xfe\xff\x00\x00\x00H\x00\x00\x00e')

        info = is_bom_encoded_info(bom_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-32 BE'
        assert info['encoding'] == 'utf-32-be'
        assert info['bom_length'] == 4
        assert info['bom_pattern'] == b'\x00\x00\xfe\xff'

    def test_no_bom_info(self, tmp_path: Path):
        """Without a BOM the info reports no BOM, ``None`` details and length 0."""
        plain_file = tmp_path / "no_bom.txt"
        plain_file.write_bytes(b'Hello, World!')

        info = is_bom_encoded_info(plain_file)

        assert info['has_bom'] is False
        assert info['bom_type'] is None
        assert info['encoding'] is None
        assert info['bom_length'] == 0
        assert info['bom_pattern'] is None

    def test_empty_file_info(self, tmp_path: Path):
        """An empty file produces the same "no BOM" info as a plain file."""
        empty_file = tmp_path / "empty.txt"
        empty_file.write_bytes(b'')

        info = is_bom_encoded_info(empty_file)

        assert info['has_bom'] is False
        assert info['bom_type'] is None
        assert info['encoding'] is None
        assert info['bom_length'] == 0
        assert info['bom_pattern'] is None

    def test_bom_precedence_utf32_vs_utf16(self, tmp_path: Path):
        """UTF-32 LE must win when the leading bytes also match UTF-16 LE."""
        ambiguous_file = tmp_path / "precedence.txt"
        # FF FE 00 00 begins with FF FE, which on its own is the UTF-16 LE BOM.
        ambiguous_file.write_bytes(b'\xff\xfe\x00\x00Additional content')

        info = is_bom_encoded_info(ambiguous_file)

        # The longer UTF-32 LE pattern is the expected match.
        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-32 LE'
        assert info['encoding'] == 'utf-32-le'
        assert info['bom_length'] == 4
        assert info['bom_pattern'] == b'\xff\xfe\x00\x00'

    def test_return_type_validation(self, tmp_path: Path):
        """The result has exactly the five expected keys with the expected value types."""
        plain_file = tmp_path / "test.txt"
        plain_file.write_bytes(b'Test content')

        info = is_bom_encoded_info(plain_file)

        # Key set must match exactly: nothing missing, nothing extra.
        assert set(info) == {'has_bom', 'bom_type', 'encoding', 'bom_length', 'bom_pattern'}

        # The optional fields may be None or their concrete type.
        none_type = type(None)
        assert isinstance(info['has_bom'], bool)
        assert isinstance(info['bom_type'], (none_type, str))
        assert isinstance(info['encoding'], (none_type, str))
        assert isinstance(info['bom_length'], int)
        assert isinstance(info['bom_pattern'], (none_type, bytes))

    def test_nonexistent_file_error(self, tmp_path: Path):
        """A path that was never created raises ``ValueError`` carrying the expected text."""
        missing = tmp_path / "nonexistent.txt"

        with pytest.raises(ValueError) as raised:
            is_bom_encoded_info(missing)

        assert "Error checking BOM encoding" in str(raised.value)

    def test_directory_instead_of_file(self, tmp_path: Path):
        """Passing a directory instead of a file raises ``ValueError``."""
        folder = tmp_path / "test_dir"
        folder.mkdir()

        with pytest.raises(ValueError, match="Error checking BOM encoding"):
            is_bom_encoded_info(folder)

    def test_large_file_with_bom(self, tmp_path: Path):
        """A UTF-8 BOM is still detected when followed by 100000 bytes of content."""
        big_file = tmp_path / "large_bom.txt"
        big_file.write_bytes(b'\xef\xbb\xbf' + b'A' * 100000)

        info = is_bom_encoded_info(big_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-8'
        assert info['encoding'] == 'utf-8'

    def test_bom_detection_priority_order(self, tmp_path: Path):
        """Each BOM prefix followed by plain content is reported as its own type.

        The list includes the UTF-32 LE prefix, whose first two bytes equal
        the UTF-16 LE BOM, so a shorter pattern must not shadow a longer one.
        """
        expectations = [
            (b'\xff\xfe\x00\x00', 'UTF-32 LE'),  # 4 bytes
            (b'\x00\x00\xfe\xff', 'UTF-32 BE'),  # 4 bytes
            (b'\xff\xfe', 'UTF-16 LE'),  # 2 bytes
            (b'\xfe\xff', 'UTF-16 BE'),  # 2 bytes
            (b'\xef\xbb\xbf', 'UTF-8'),  # 3 bytes
        ]

        for index, (prefix, bom_name) in enumerate(expectations):
            case_file = tmp_path / f"priority_test_{index}.txt"
            case_file.write_bytes(prefix + b'Content')

            info = is_bom_encoded_info(case_file)

            assert info['bom_type'] == bom_name
            assert info['bom_pattern'] == prefix

    def test_csv_file_with_utf8_bom(self, tmp_path: Path):
        """A CSV payload prefixed with the UTF-8 BOM is reported as UTF-8 with length 3."""
        csv_file = tmp_path / "data.csv"
        csv_file.write_bytes(
            b'\xef\xbb\xbf"Name","Age","City"\n"John",30,"New York"\n"Jane",25,"Tokyo"'
        )

        info = is_bom_encoded_info(csv_file)

        assert info['has_bom'] is True
        assert info['bom_type'] == 'UTF-8'
        assert info['encoding'] == 'utf-8'
        assert info['bom_length'] == 3

    def test_csv_file_without_bom(self, tmp_path: Path):
        """A CSV payload without a BOM is reported as having no BOM."""
        csv_file = tmp_path / "data_no_bom.csv"
        csv_file.write_bytes(
            b'"Name","Age","City"\n"John",30,"New York"\n"Jane",25,"Tokyo"'
        )

        info = is_bom_encoded_info(csv_file)

        assert info['has_bom'] is False
        assert info['bom_type'] is None
        assert info['encoding'] is None
        assert info['bom_length'] == 0
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestBomEncodingInfo:
    """Tests for the ``BomEncodingInfo`` TypedDict shape.

    These are mostly static-typing checks: at runtime each test builds a
    plain dict literal and reads its values back.
    """

    def test_typed_dict_structure(self):
        """A fully populated ``BomEncodingInfo`` exposes all five keys."""
        # The annotation is what a type checker validates; the runtime
        # assertions below only read the literal back.
        populated: BomEncodingInfo = {
            'has_bom': True,
            'bom_type': 'UTF-8',
            'encoding': 'utf-8',
            'bom_length': 3,
            'bom_pattern': b'\xef\xbb\xbf',
        }

        assert populated['has_bom'] is True
        assert populated['bom_type'] == 'UTF-8'
        assert populated['encoding'] == 'utf-8'
        assert populated['bom_length'] == 3
        assert populated['bom_pattern'] == b'\xef\xbb\xbf'

    def test_typed_dict_none_values(self):
        """A "no BOM" ``BomEncodingInfo`` may carry ``None`` in its optional fields."""
        unpopulated: BomEncodingInfo = {
            'has_bom': False,
            'bom_type': None,
            'encoding': None,
            'bom_length': 0,
            'bom_pattern': None,
        }

        assert unpopulated['has_bom'] is False
        assert unpopulated['bom_type'] is None
        assert unpopulated['encoding'] is None
        assert unpopulated['bom_length'] == 0
        assert unpopulated['bom_pattern'] is None
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
class TestIntegration:
    """Integration tests combining ``is_bom_encoded`` and ``is_bom_encoded_info``."""

    def test_is_bom_encoded_uses_info_function(self, tmp_path: Path):
        """The boolean check and the detailed info agree for a UTF-8 BOM file.

        Only the consistency of the two results is asserted; the test does
        not inspect how ``is_bom_encoded`` is implemented.
        """
        sample = tmp_path / "integration.txt"
        sample.write_bytes(b'\xef\xbb\xbfIntegration test')

        flag = is_bom_encoded(sample)
        info = is_bom_encoded_info(sample)

        assert flag == info['has_bom']
        assert flag is True

    def test_multiple_file_bom_detection_workflow(self, tmp_path: Path):
        """Run both functions over several files and cross-check the results."""
        samples = {
            'utf8_bom.csv': b'\xef\xbb\xbf"data","value"\n"test",123',
            'utf16_le.txt': b'\xff\xfeH\x00e\x00l\x00l\x00o\x00',
            'no_bom.txt': b'Plain ASCII text',
            'empty.txt': b'',
        }

        flags = {}
        infos = {}
        for name, payload in samples.items():
            path = tmp_path / name
            path.write_bytes(payload)
            flags[name], infos[name] = is_bom_encoded(path), is_bom_encoded_info(path)

        # Expected boolean outcome for each file.
        assert flags['utf8_bom.csv'] is True
        assert flags['utf16_le.txt'] is True
        assert flags['no_bom.txt'] is False
        assert flags['empty.txt'] is False

        # The detailed result must agree with the boolean one for every file.
        for name, flag in flags.items():
            assert flag == infos[name]['has_bom']

        # Spot-check the reported encodings.
        assert infos['utf8_bom.csv']['encoding'] == 'utf-8'
        assert infos['utf16_le.txt']['encoding'] == 'utf-16-le'
        assert infos['no_bom.txt']['encoding'] is None

    def test_csv_loading_workflow(self, tmp_path: Path):
        """Simulate a CSV loader that derives encoding and skip offset from the BOM info."""
        with_bom = tmp_path / "data_with_bom.csv"
        without_bom = tmp_path / "data_without_bom.csv"

        # One CSV carries the UTF-8 BOM, the other starts directly with data.
        with_bom.write_bytes(b'\xef\xbb\xbf"Name","Age"\n"Alice",30\n"Bob",25')
        without_bom.write_bytes(b'"Name","Age"\n"Charlie",35\n"Diana",28')

        summaries: list[dict[str, str | bool | int]] = []
        for path in (with_bom, without_bom):
            details = is_bom_encoded_info(path)
            summaries.append({
                'file': path.name,
                'has_bom': details['has_bom'],
                # A missing encoding (None) falls back to the 'default' marker.
                'encoding': details['encoding'] or 'default',
                'skip_bytes': details['bom_length'],
            })

        assert len(summaries) == 2

        by_name = {entry['file']: entry for entry in summaries}
        bom_entry = by_name['data_with_bom.csv']
        plain_entry = by_name['data_without_bom.csv']

        assert bom_entry['has_bom'] is True
        assert bom_entry['encoding'] == 'utf-8'
        assert bom_entry['skip_bytes'] == 3

        assert plain_entry['has_bom'] is False
        assert plain_entry['encoding'] == 'default'
        assert plain_entry['skip_bytes'] == 0

    def test_error_handling_consistency(self, tmp_path: Path):
        """Both functions raise ``ValueError`` for a file that does not exist."""
        missing = tmp_path / "does_not_exist.txt"

        for check in (is_bom_encoded, is_bom_encoded_info):
            with pytest.raises(ValueError):
                check(missing)

    def test_all_supported_bom_types(self, tmp_path: Path):
        """Every listed BOM is detected by both functions with matching details."""
        # (file stem, BOM bytes, bom_type, encoding, bom_length)
        cases = [
            ('utf8', b'\xef\xbb\xbf', 'UTF-8', 'utf-8', 3),
            ('utf16_le', b'\xff\xfe', 'UTF-16 LE', 'utf-16-le', 2),
            ('utf16_be', b'\xfe\xff', 'UTF-16 BE', 'utf-16-be', 2),
            ('utf32_le', b'\xff\xfe\x00\x00', 'UTF-32 LE', 'utf-32-le', 4),
            ('utf32_be', b'\x00\x00\xfe\xff', 'UTF-32 BE', 'utf-32-be', 4),
        ]

        for stem, prefix, bom_name, codec, prefix_len in cases:
            case_file = tmp_path / f"{stem}_test.txt"
            case_file.write_bytes(prefix + b'Test content')

            # Boolean check first ...
            assert is_bom_encoded(case_file) is True

            # ... then the detailed description of the same file.
            details = is_bom_encoded_info(case_file)
            assert details['has_bom'] is True
            assert details['bom_type'] == bom_name
            assert details['encoding'] == codec
            assert details['bom_length'] == prefix_len
            assert details['bom_pattern'] == prefix
|
|
|
|
|
|
|
|
|
|
|
|
|
|
|
# __END__
|