Update Language\Encoding and phpunit tests

Fix missing replace char settings for the conversion check call: the PHP replace char method was never called. Also add the standard type settings next to the char settings. The return (get) call can return either the value set in the class or the value currently set in PHP. Fix the mime encode trailing space problem when the length is exactly the split length. Mime encode now uses \r\n for all line breaks; this can be controlled via a parameter.
2022-04-12 20:16:42 +09:00
parent d553c1364f
commit cc77d7e031
2 changed files with 322 additions and 28 deletions
--- a/4dev/tests/CoreLibsLanguageEncodingTest.php
+++ b/4dev/tests/CoreLibsLanguageEncodingTest.php
@@ -16,15 +16,262 @@ final class CoreLibsLanguageEncodingTest extends TestCase
 	/**
 	 * Undocumented function
 	 *
-	 * @testdox Language\Encoding Class tests
+	 * @return array<string, array{0: string, 1: string, 2: string}>
+	 */
+	public function mbMimeEncodeProvider(): array
+	{
+		return [
+			// 0: input string
+			// 1: target encoding passed to __mbMimeEncode
+			// 2: expected mime encoded header string
+			'standard UTF-8' => [
+				'Test string',
+				'UTF-8',
+				'Test string'
+			],
+			'long text UTF-8' => [
+				'The quick brown fox jumps over the lazy sheep that sleeps in the ravine '
+					. 'and has no idea what is going on here',
+				'UTF-8',
+				'The quick brown fox jumps over the lazy sheep that sleeps in the ravine '
+					. 'and has no idea what is going on here'
+			],
+			'standard with special chars UTF-8' => [
+				'This is ümläßtと漢字もカタカナ！!^$%&',
+				'UTF-8',
+				'This is =?UTF-8?B?w7xtbMOkw59044Go5ryi5a2X44KC44Kr44K/44Kr44OK77yBIV4k?='
+					. "\r\n"
+					. ' =?UTF-8?B?JQ==?=&'
+			],
+			'35 chars and space at the end UTF-8' => [
+				'12345678901234567890123456789012345 '
+					. 'is there a space?',
+				'UTF-8',
+				'12345678901234567890123456789012345 '
+					. 'is there a =?UTF-8?B?c3BhY2U/?='
+			],
+			'36 chars and space at the end UTF-8' => [
+				'123456789012345678901234567890123456 '
+					. 'is there a space?',
+				'UTF-8',
+				'123456789012345678901234567890123456 '
+					. 'is there a =?UTF-8?B?c3BhY2U/?='
+			],
+			'36 kanji and space UTF-8' => [
+				'カタカナカタカナかなカタカナカタカナかなカタカナカタカナかなカタカナカタ '
+					. 'is there a space?',
+				'UTF-8',
+				"=?UTF-8?B?44Kr44K/44Kr44OK44Kr44K/44Kr44OK44GL44Gq44Kr44K/44Kr44OK44Kr?=\r\n"
+					. " =?UTF-8?B?44K/44Kr44OK?=\r\n"
+					. " =?UTF-8?B?44GL44Gq44Kr44K/44Kr44OK44Kr44K/44Kr44OK44GL44Gq44Kr44K/44Kr?=\r\n"
+					. " =?UTF-8?B?44OK44Kr44K/?= is there a =?UTF-8?B?c3BhY2U/?="
+			]
+		];
+	}
+
+	/**
+	 * mb mime header encoding test (encode, compare, decode back, compare)
+	 *
+	 * @covers ::__mbMimeEncode
+	 * @dataProvider mbMimeEncodeProvider
+	 * @testdox mb encoding target $encoding [$_dataName]
 	 *
 	 * @return void
 	 */
-	public function testLanguageEncoding()
+	public function testUuMbMimeEncode(string $input, string $encoding, string $expected): void
 	{
-		$this->assertTrue(true, 'Language Encoding Tests not implemented');
-		$this->markTestIncomplete(
-			'Language\Encoding Tests have not yet been implemented'
+		// encode the input with the target encoding first
+		$encoded = \CoreLibs\Language\Encoding::__mbMimeEncode($input, $encoding);
+		// print "MIME: -" . $encoded . "-\n";
+		$this->assertEquals(
+			$expected,
+			$encoded
+		);
+		$decoded = mb_decode_mimeheader($encoded);
+		// print "INPUT  : " . $input . "\n";
+		// print "DECODED: " . $decoded . "\n";
+		// round trip: the decoded header must match the original input
+		$this->assertEquals(
+			$input,
+			$decoded
+		);
+	}
+
+	/**
+	 * Provides the data sets for testConvertEncoding
+	 *
+	 * @return array<string, array<int, string|null>>
+	 */
+	public function convertEncodingProvider(): array
+	{
+		return [
+			// 0: original string
+			// 1: target encoding
+			// 2: optional source encoding
+			// 3: auto check (null in all current data sets)
+			// 4: expected string
+			// 5: expected string encoding
+			'simple from UTF-8 to SJIS' => [
+				'input string',
+				'SJIS',
+				null,
+				null,
+				'input string',
+				'SJIS'
+			],
+			'kanji from UTF-8 to SJIS' => [
+				'日本語',
+				'SJIS',
+				null,
+				null,
+				'日本語',
+				'SJIS'
+			],
+			'kanji from UTF-8 to SJIS with source' => [
+				'日本語',
+				'SJIS',
+				'UTF-8',
+				null,
+				'日本語',
+				'SJIS'
+			],
+		];
+	}
+
+	/**
+	 * Converts input with convertEncoding and compares to mb_convert_encoding
+	 *
+	 * @covers ::convertEncoding
+	 * @dataProvider convertEncodingProvider
+	 * @testdox convert encoding $target_encoding, source: $source_encoding, auto: $auto_check [$_dataName]
+	 *
+	 * @param  string      $input
+	 * @param  string      $target_encoding
+	 * @param  string|null $source_encoding
+	 * @param  bool|null   $auto_check
+	 * @param  string      $expected
+	 * @param  string      $expected_encoding
+	 * @return void
+	 */
+	public function testConvertEncoding(
+		string $input,
+		string $target_encoding,
+		?string $source_encoding,
+		?bool $auto_check,
+		string $expected,
+		string $expected_encoding
+	): void {
+		if ($source_encoding === null and $auto_check === null) {
+			$string = \CoreLibs\Language\Encoding::convertEncoding($input, $target_encoding);
+		} elseif ($auto_check === null) {
+			$string = \CoreLibs\Language\Encoding::convertEncoding($input, $target_encoding, $source_encoding);
+		} else {
+			$string = \CoreLibs\Language\Encoding::convertEncoding(
+				$input,
+				$target_encoding,
+				$source_encoding,
+				$auto_check
+			);
+		}
+		// expected is treated as UTF-8 text, convert it to the expected encoding to compare
+		$target = mb_convert_encoding($expected, $expected_encoding, 'UTF-8');
+		// print "IN: $input, $target_encoding\n";
+		$this->assertEquals(
+			$target,
+			$string
+		);
+	}
+
+	/**
+	 * Provides the data sets for testCheckConvertEncoding
+	 *
+	 * @return array<string, array<int, mixed>>
+	 */
+	public function checkConvertEncodingProvider(): array
+	{
+		return [
+			// 0: string to test
+			// 1: source encoding
+			// 2: target encoding
+			// 3: substitute character (null, code point int or keyword)
+			// 4: expected: false if ok, or array with the failed characters
+			'valid test UTF-8 to SJIS (default)' => [
+				'日本語',
+				'UTF-8',
+				'SJIS',
+				null,
+				false
+			],
+			'invalid test UTF-8 to SJIS (dots)' => [
+				'❶',
+				'UTF-8',
+				'SJIS',
+				0x2234,
+				['❶']
+			],
+			'invalid test UTF-8 to SJIS (none)' => [
+				'❶',
+				'UTF-8',
+				'SJIS',
+				'none',
+				['❶']
+			],
+			'invalid test UTF-8 to SJIS (long)' => [
+				'❶',
+				'UTF-8',
+				'SJIS',
+				'long',
+				['❶']
+			],
+			'invalid test UTF-8 to SJIS (entity)' => [
+				'❶',
+				'UTF-8',
+				'SJIS',
+				'entity',
+				['❶']
+			],
+		];
+	}
+
+	/**
+	 * Sets the error char if given, verifies the getter, then runs the check
+	 *
+	 * @covers ::checkConvertEncoding
+	 * @dataProvider checkConvertEncodingProvider
+	 * @testdox check encoding convert from $from_encoding to $to_encoding [$_dataName]
+	 *
+	 * @param  string          $input
+	 * @param  string          $from_encoding
+	 * @param  string          $to_encoding
+	 * @param  string|int|null $error_char
+	 * @param  array|bool      $expected
+	 * @return void
+	 */
+	public function testCheckConvertEncoding(
+		string $input,
+		string $from_encoding,
+		string $to_encoding,
+		$error_char,
+		$expected
+	): void {
+		if ($error_char !== null) {
+			\CoreLibs\Language\Encoding::setErrorChar($error_char);
+			if (!in_array($error_char, ['none', 'long', 'entity'])) {
+				$this->assertEquals(
+					\IntlChar::chr($error_char),
+					\CoreLibs\Language\Encoding::getErrorChar()
+				);
+			} else {
+				$this->assertEquals(
+					$error_char,
+					\CoreLibs\Language\Encoding::getErrorChar()
+				);
+			}
+		}
+		$return = \CoreLibs\Language\Encoding::checkConvertEncoding($input, $from_encoding, $to_encoding);
+		$this->assertEquals(
+			$expected,
+			$return
 		);
 	}
 }
--- a/www/lib/CoreLibs/Language/Encoding.php
+++ b/www/lib/CoreLibs/Language/Encoding.php
@@ -10,23 +10,32 @@ namespace CoreLibs\Language;

 class Encoding
 {
-	/** @var string */
+	/** @var int<min, -1>|int<1, max>|string */
 	private static $mb_error_char = '';

 	/**
-	 * wrapper function for mb mime convert, for correct conversion with long strings
-	 * @param  string $string   string to encode
-	 * @param  string $encoding target encoding
-	 * @return string           encoded string
+	 * wrapper function for mb mime convert
+	 * for correct conversion with long strings
+	 *
+	 * @param  string $string     string to encode
+	 * @param  string $encoding   target encoding
+	 * @param  string $line_break default line break is \r\n
+	 * @return string             encoded string
 	 */
-	public static function __mbMimeEncode(string $string, string $encoding): string
-	{
+	public static function __mbMimeEncode(
+		string $string,
+		string $encoding,
+		string $line_break = "\r\n"
+	): string {
 		// set internal encoding, so the mimeheader encode works correctly
 		mb_internal_encoding($encoding);
 		// if a subject, make a work around for the broken mb_mimencode
 		$pos = 0;
-		$split = 36; // after 36 single bytes characters, if then comes MB, it is broken
-					 // has to 2 x 36 < 74 so the mb_encode_mimeheader 74 hardcoded split does not get triggered
+		// after 36 single bytes characters,
+		// if then comes MB, it is broken
+		// has to 2 x 36 < 74 so the mb_encode_mimeheader
+		// 74 hardcoded split does not get triggered
+		$split = 36;
 		$_string = '';
 		while ($pos < mb_strlen($string, $encoding)) {
 			$output = mb_strimwidth($string, $pos, $split, "", $encoding);
@@ -39,38 +48,68 @@ class Encoding
 			// only make linebreaks if we have mime encoded code inside
 			// the space only belongs in the second line
 			if ($_string && preg_match("/^=\?/", $_string_encoded)) {
-				$_string .= "\n ";
+				$_string .= $line_break . " ";
+			} elseif (
+				// hack for plain text with space at the end
+				mb_strlen($output, $encoding) == $split &&
+				mb_substr($output, -1, 1, $encoding) == " "
+			) {
+				// if output ends with space, add one more
+				$_string_encoded .= " ";
 			}
 			$_string .= $_string_encoded;
 		}
 		// strip out any spaces BEFORE a line break
-		$string = str_replace(" \n", "\n", $_string);
+		$string = str_replace(" " . $line_break, $line_break, $_string);
 		return $string;
 	}

 	/**
 	 * set error char
 	 *
-	 * @param  string $string The character to use to represent error chars
+	 * @param  string|int|null $string The character to use to represent
+	 *                                 error chars: one of the keywords
+	 *                                 "none", "long" or "entity",
+	 *                                 or a valid code point in int
+	 *                                 like 0x2234 (8756, ∴)
+	 *                                 default character is ? (63)
+	 *                                 any empty value (null, '', 0) is set as "none"
 	 * @return void
 	 */
-	public static function setErrorChar(string $string): void
+	public static function setErrorChar($string): void
 	{
-		self::$mb_error_char = $string;
+		if (empty($string)) {
+			$string = 'none';
+		}
+		if (!in_array($string, ['none', 'long', 'entity'])) {
+			self::$mb_error_char = \IntlChar::chr($string);
+		} else {
+			self::$mb_error_char = $string;
+		}
+		mb_substitute_character($string);
 	}

 	/**
 	 * get the current set error character
 	 *
-	 * @return string Set error character
+	 * @param  bool $return_substitute_func if true, return the value
+	 *                                      from mb_substitute_character()
+	 *                                      instead of the class stored one
+	 * @return string|int Set error character
 	 */
-	public static function getErrorChar(): string
+	public static function getErrorChar(bool $return_substitute_func = false)
 	{
-		return self::$mb_error_char;
+		// true: value currently set in php, else the value stored in this class
+		if ($return_substitute_func === true) {
+			return mb_substitute_character();
+		} else {
+			return self::$mb_error_char;
+		}
 	}

 	/**
-	 * test if a string can be safely convert between encodings. mostly utf8 to shift jis
+	 * test if a string can be safely convert between encodings.
+	 * mostly utf8 to shift jis
 	 * the default compare has a possibility of failure, especially with windows
 	 * it is recommended to the following in the script which uses this method:
 	 * mb_substitute_character(0x2234);
@@ -80,13 +119,17 @@ class Encoding
 	 * if check to ISO-2022-JP-MS
 	 * set three dots (∴) as wrong character for correct convert error detect
 	 * (this char is used, because it is one of the least used ones)
+	 *
 	 * @param  string     $string        string to test
 	 * @param  string     $from_encoding encoding of string to test
 	 * @param  string     $to_encoding   target encoding
 	 * @return bool|array<string>        false if no error or array with failed characters
 	 */
-	public static function checkConvertEncoding(string $string, string $from_encoding, string $to_encoding)
-	{
+	public static function checkConvertEncoding(
+		string $string,
+		string $from_encoding,
+		string $to_encoding
+	) {
 		// convert to target encoding and convert back
 		$temp = mb_convert_encoding($string, $to_encoding, $from_encoding);
 		$compare = mb_convert_encoding($temp, $from_encoding, $to_encoding);
@@ -97,9 +140,11 @@ class Encoding
 			for ($i = 0, $iMax = mb_strlen($string, $from_encoding); $i < $iMax; $i++) {
 				$char = mb_substr($string, $i, 1, $from_encoding);
 				$r_char = mb_substr($compare, $i, 1, $from_encoding);
-				// the ord 194 is a hack to fix the IE7/IE8 bug with line break and illegal character
+				// the ord 194 is a hack to fix the IE7/IE8
+				// bug with line break and illegal character
 				if (
-					(($char != $r_char && !self::$mb_error_char) ||
+					(($char != $r_char && (!self::$mb_error_char ||
+					in_array(self::$mb_error_char, ['none', 'long', 'entity']))) ||
 					($char != $r_char && $r_char == self::$mb_error_char && self::$mb_error_char)) &&
 					ord($char) != 194
 				) {
@@ -118,6 +163,7 @@ class Encoding
 	 * if source encoding is set and auto check is true (default) a second
 	 * check is done so that the source string encoding actually matches
 	 * will be skipped if source encoding detection is ascii
+	 *
 	 * @param  string $string          string to convert
 	 * @param  string $to_encoding     target encoding
 	 * @param  string $source_encoding optional source encoding, will try to auto detect
@@ -144,7 +190,8 @@ class Encoding
 			$_source_encoding == $source_encoding
 		) {
 			// trigger check if we have override source encoding.
-			// if different (_source is all but not ascii) then trigger skip if matching
+			// if different (_source is all but not ascii) then trigger
+			// skip if matching
 		}
 		if ($source_encoding != $to_encoding) {
 			if ($source_encoding) {