Update Language\Encoding and phpunit tets

Fix missing replace char settings for conversion check call.
The php replace char method was never called. Also add standard type
settings next to char settings.
Return (get) call can either class set or current set in php

Fix mime encode with trailing space problem if length is on split
length. Mime encode uses \r\n for all line breaks now, can be controlled
via parameter
This commit is contained in:
Clemens Schwaighofer
2022-04-12 20:16:42 +09:00
parent d553c1364f
commit cc77d7e031
2 changed files with 322 additions and 28 deletions

View File

@@ -16,15 +16,262 @@ final class CoreLibsLanguageEncodingTest extends TestCase
/**
* Undocumented function
*
* @testdox Language\Encoding Class tests
* @return array
*/
public function mbMimeEncodeProvider(): array
{
return [
// 0: input string
// 1: encoding
// 2: expected
'standard UTF-8' => [
'Test string',
'UTF-8',
'Test string'
],
'long text UTF-8' => [
'The quick brown fox jumps over the lazy sheep that sleeps in the ravine '
. 'and has no idea what is going on here',
'UTF-8',
'The quick brown fox jumps over the lazy sheep that sleeps in the ravine '
. 'and has no idea what is going on here'
],
'standard with special chars UTF-8' => [
'This is ümläßtと漢字もカタカナ!^$%&',
'UTF-8',
'This is =?UTF-8?B?w7xtbMOkw59044Go5ryi5a2X44KC44Kr44K/44Kr44OK77yBIV4k?='
. "\r\n"
. ' =?UTF-8?B?JQ==?=&'
],
'35 chars and space at the end UTF-8' => [
'12345678901234567890123456789012345 '
. 'is there a space?',
'UTF-8',
'12345678901234567890123456789012345 '
. 'is there a =?UTF-8?B?c3BhY2U/?='
],
'36 chars and space at the end UTF-8' => [
'123456789012345678901234567890123456 '
. 'is there a space?',
'UTF-8',
'123456789012345678901234567890123456 '
. 'is there a =?UTF-8?B?c3BhY2U/?='
],
'36 kanji and space UTF-8' => [
'カタカナカタカナかなカタカナカタカナかなカタカナカタカナかなカタカナカタ '
. 'is there a space?',
'UTF-8',
"=?UTF-8?B?44Kr44K/44Kr44OK44Kr44K/44Kr44OK44GL44Gq44Kr44K/44Kr44OK44Kr?=\r\n"
. " =?UTF-8?B?44K/44Kr44OK?=\r\n"
. " =?UTF-8?B?44GL44Gq44Kr44K/44Kr44OK44Kr44K/44Kr44OK44GL44Gq44Kr44K/44Kr?=\r\n"
. " =?UTF-8?B?44OK44Kr44K/?= is there a =?UTF-8?B?c3BhY2U/?="
]
];
}
/**
* mb mime header encoding test
*
* @covers ::__mbMimeEncode
* @dataProvider mbMimeEncodeProvider
* @testdox mb encoding target $encoding [$_dataName]
*
* @return void
*/
public function testLanguageEncoding()
public function testUuMbMimeEncode(string $input, string $encoding, string $expected): void
{
$this->assertTrue(true, 'Language Encoding Tests not implemented');
$this->markTestIncomplete(
'Language\Encoding Tests have not yet been implemented'
// encode string first
$encoded = \CoreLibs\Language\Encoding::__mbMimeEncode($input, $encoding);
// print "MIME: -" . $encoded . "-\n";
$this->assertEquals(
$expected,
$encoded
);
$decoded = mb_decode_mimeheader($encoded);
// print "INPUT : " . $input . "\n";
// print "DECODED: " . $decoded . "\n";
// back compare decoded
$this->assertEquals(
$input,
$decoded
);
}
/**
* Undocumented function
*
* @return array
*/
public function convertEncodingProvider(): array
{
return [
// 0: original string
// 1: target encoding
// 2: optional source encoding
// 3: auto check (not used)
// 4: expected string
// 5: expected string encoding
'simple from UTF-8 to SJIS' => [
'input string',
'SJIS',
null,
null,
'input string',
'SJIS'
],
'kanji from UTF-8 to SJIS' => [
'日本語',
'SJIS',
null,
null,
'日本語',
'SJIS'
],
'kanji from UTF-8 to SJIS with source' => [
'日本語',
'SJIS',
'UTF-8',
null,
'日本語',
'SJIS'
],
];
}
/**
* Undocumented function
*
* @covers ::convertEncoding
* @dataProvider convertEncodingProvider
* @testdox convert encoding $target_encoding, source: $source_encoding, auto: $auto_check [$_dataName]
*
* @param string $input
* @param string $target_encoding
* @param string $source_encoding
* @param bool $auto_check
* @param string $expected
* @param string $expected_encoding
* @return void
*/
public function testConvertEncoding(
string $input,
string $target_encoding,
?string $source_encoding,
?bool $auto_check,
string $expected,
string $expected_encoding
): void {
if ($source_encoding === null and $auto_check === null) {
$string = \CoreLibs\Language\Encoding::convertEncoding($input, $target_encoding);
} elseif ($auto_check === null) {
$string = \CoreLibs\Language\Encoding::convertEncoding($input, $target_encoding, $source_encoding);
} else {
$string = \CoreLibs\Language\Encoding::convertEncoding(
$input,
$target_encoding,
$source_encoding,
$auto_check
);
}
// because we can't store encoding in here anyway
$target = mb_convert_encoding($expected, $expected_encoding, 'UTF-8');
// print "IN: $input, $target_encoding\n";
$this->assertEquals(
$target,
$string
);
}
/**
* Undocumented function
*
* @return array
*/
public function checkConvertEncodingProvider(): array
{
return [
// 0: string to test
// 1: source encoding
// 2: target encoding
// 3: substitue character
// 4: false for ok, array with error list
'valid test UTF-8 to SJIS (default)' => [
'日本語',
'UTF-8',
'SJIS',
null,
false
],
'invalid test UTF-8 to SJIS (dots)' => [
'❶',
'UTF-8',
'SJIS',
0x2234,
['❶']
],
'invalid test UTF-8 to SJIS (none)' => [
'❶',
'UTF-8',
'SJIS',
'none',
['❶']
],
'invalid test UTF-8 to SJIS (long)' => [
'❶',
'UTF-8',
'SJIS',
'long',
['❶']
],
'invalid test UTF-8 to SJIS (entity)' => [
'❶',
'UTF-8',
'SJIS',
'entity',
['❶']
],
];
}
/**
* Undocumented function
*
* @covers ::checkConvertEncoding
* @dataProvider checkConvertEncodingProvider
* @testdox check encoding convert from $from_encoding to $to_encoding [$_dataName]
*
* @param string $input
* @param string $from_encoding
* @param string $to_encoding
* @param string|int|null $error_char
* @param array|bool $expected
* @return void
*/
public function testCheckConvertEncoding(
string $input,
string $from_encoding,
string $to_encoding,
$error_char,
$expected
): void {
if ($error_char !== null) {
\CoreLibs\Language\Encoding::setErrorChar($error_char);
if (!in_array($error_char, ['none', 'long', 'entity'])) {
$this->assertEquals(
\IntlChar::chr($error_char),
\CoreLibs\Language\Encoding::getErrorChar()
);
} else {
$this->assertEquals(
$error_char,
\CoreLibs\Language\Encoding::getErrorChar()
);
}
}
$return = \CoreLibs\Language\Encoding::checkConvertEncoding($input, $from_encoding, $to_encoding);
$this->assertEquals(
$expected,
$return
);
}
}

View File

@@ -10,23 +10,32 @@ namespace CoreLibs\Language;
class Encoding
{
/** @var string */
/** @var int<min, -1>|int<1, max>|string */
private static $mb_error_char = '';
/**
* wrapper function for mb mime convert, for correct conversion with long strings
* @param string $string string to encode
* @param string $encoding target encoding
* @return string encoded string
* wrapper function for mb mime convert
* for correct conversion with long strings
*
* @param string $string string to encode
* @param string $encoding target encoding
* @param string $line_break default line break is \r\n
* @return string encoded string
*/
public static function __mbMimeEncode(string $string, string $encoding): string
{
public static function __mbMimeEncode(
string $string,
string $encoding,
string $line_break = "\r\n"
): string {
// set internal encoding, so the mimeheader encode works correctly
mb_internal_encoding($encoding);
// if a subject, make a work around for the broken mb_mimencode
$pos = 0;
$split = 36; // after 36 single bytes characters, if then comes MB, it is broken
// has to 2 x 36 < 74 so the mb_encode_mimeheader 74 hardcoded split does not get triggered
// after 36 single bytes characters,
// if then comes MB, it is broken
// has to 2 x 36 < 74 so the mb_encode_mimeheader
// 74 hardcoded split does not get triggered
$split = 36;
$_string = '';
while ($pos < mb_strlen($string, $encoding)) {
$output = mb_strimwidth($string, $pos, $split, "", $encoding);
@@ -39,38 +48,68 @@ class Encoding
// only make linebreaks if we have mime encoded code inside
// the space only belongs in the second line
if ($_string && preg_match("/^=\?/", $_string_encoded)) {
$_string .= "\n ";
$_string .= $line_break . " ";
} elseif (
// hack for plain text with space at the end
mb_strlen($output, $encoding) == $split &&
mb_substr($output, -1, 1, $encoding) == " "
) {
// if output ends with space, add one more
$_string_encoded .= " ";
}
$_string .= $_string_encoded;
}
// strip out any spaces BEFORE a line break
$string = str_replace(" \n", "\n", $_string);
$string = str_replace(" " . $line_break, $line_break, $_string);
return $string;
}
/**
* set error char
*
* @param string $string The character to use to represent error chars
* @param string|int|null $string The character to use to represent
* error chars
* "long" for long, "none" for none
* or a valid code point in int
* like 0x2234 (8756, ∴)
* default character is ? (63)
* if null is set then "none"
* @return void
*/
public static function setErrorChar(string $string): void
public static function setErrorChar($string): void
{
self::$mb_error_char = $string;
if (empty($string)) {
$string = 'none';
}
if (!in_array($string, ['none', 'long', 'entity'])) {
self::$mb_error_char = \IntlChar::chr($string);
} else {
self::$mb_error_char = $string;
}
mb_substitute_character($string);
}
/**
* get the current set error character
*
* @return string Set error character
* @param bool $return_substitute_func if set to true return the set
* character from the php function
* directly
* @return string|int Set error character
*/
public static function getErrorChar(): string
public static function getErrorChar(bool $return_substitute_func = false)
{
return self::$mb_error_char;
// return mb_substitute_character();
if ($return_substitute_func === true) {
return mb_substitute_character();
} else {
return self::$mb_error_char;
}
}
/**
* test if a string can be safely convert between encodings. mostly utf8 to shift jis
* test if a string can be safely convert between encodings.
* mostly utf8 to shift jis
* the default compare has a possibility of failure, especially with windows
* it is recommended to the following in the script which uses this method:
* mb_substitute_character(0x2234);
@@ -80,13 +119,17 @@ class Encoding
* if check to ISO-2022-JP-MS
* set three dots (∴) as wrong character for correct convert error detect
* (this char is used, because it is one of the least used ones)
*
* @param string $string string to test
* @param string $from_encoding encoding of string to test
* @param string $to_encoding target encoding
* @return bool|array<string> false if no error or array with failed characters
*/
public static function checkConvertEncoding(string $string, string $from_encoding, string $to_encoding)
{
public static function checkConvertEncoding(
string $string,
string $from_encoding,
string $to_encoding
) {
// convert to target encoding and convert back
$temp = mb_convert_encoding($string, $to_encoding, $from_encoding);
$compare = mb_convert_encoding($temp, $from_encoding, $to_encoding);
@@ -97,9 +140,11 @@ class Encoding
for ($i = 0, $iMax = mb_strlen($string, $from_encoding); $i < $iMax; $i++) {
$char = mb_substr($string, $i, 1, $from_encoding);
$r_char = mb_substr($compare, $i, 1, $from_encoding);
// the ord 194 is a hack to fix the IE7/IE8 bug with line break and illegal character
// the ord 194 is a hack to fix the IE7/IE8
// bug with line break and illegal character
if (
(($char != $r_char && !self::$mb_error_char) ||
(($char != $r_char && (!self::$mb_error_char ||
in_array(self::$mb_error_char, ['none', 'long', 'entity']))) ||
($char != $r_char && $r_char == self::$mb_error_char && self::$mb_error_char)) &&
ord($char) != 194
) {
@@ -118,6 +163,7 @@ class Encoding
* if source encoding is set and auto check is true (default) a second
* check is done so that the source string encoding actually matches
* will be skipped if source encoding detection is ascii
*
* @param string $string string to convert
* @param string $to_encoding target encoding
* @param string $source_encoding optional source encoding, will try to auto detect
@@ -144,7 +190,8 @@ class Encoding
$_source_encoding == $source_encoding
) {
// trigger check if we have override source encoding.
// if different (_source is all but not ascii) then trigger skip if matching
// if different (_source is all but not ascii) then trigger
// skip if matching
}
if ($source_encoding != $to_encoding) {
if ($source_encoding) {