Файл: qa-include/qa-util-string.php
Строк: 453
<?php
/*
Question2Answer (c) Gideon Greenspan
http://www.question2answer.org/
File: qa-include/qa-util-string.php
Version: See define()s at top of qa-include/qa-base.php
Description: Some useful string-related stuff
This program is free software; you can redistribute it and/or
modify it under the terms of the GNU General Public License
as published by the Free Software Foundation; either version 2
of the License, or (at your option) any later version.
This program is distributed in the hope that it will be useful,
but WITHOUT ANY WARRANTY; without even the implied warranty of
MERCHANTABILITY or FITNESS FOR A PARTICULAR PURPOSE. See the
GNU General Public License for more details.
More about this license: http://www.question2answer.org/license.php
*/
// Direct-request guard: QA_VERSION is expected to have been defined by the including
// script before this file is loaded (the file header points at qa-include/qa-base.php).
// If it is missing, redirect to the parent directory and stop executing.
if (defined('QA_VERSION') === false) {
	header('Location: ../');
	exit();
}
// Functions
function qa_string_initialize()
/*
	Set up some global tables to be used by other functions in this file.

	Populates two globals:

	$qa_utf8punctuation - maps the UTF-8 byte sequence of a punctuation/spacing character to its
	ASCII stand-in: a space, a hyphen for the dash family, or an apostrophe for the right single
	quotation mark. qa_string_to_words() applies this table with strtr().

	$qa_utf8removeaccents - maps the UTF-8 byte sequence of an accented or otherwise decorated Latin
	letter to its plain Roman equivalent (one or two ASCII letters), preserving case.

	Every key must be written with \x hex escapes inside a double-quoted string so that PHP stores
	the raw bytes of the character; without the backslashes the key would be literal ASCII text
	and would never match real UTF-8 input.

	Takes no parameters and returns nothing, unless an override is registered for this function,
	in which case the override is called instead and its result is returned.
*/
	{
		if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }

		global $qa_utf8punctuation, $qa_utf8removeaccents;

		$qa_utf8punctuation=array( // converts UTF-8 punctuation characters to spaces (or in some cases, hyphens)
			"\xC2\xA1" => ' ', // INVERTED EXCLAMATION MARK
			"\xC2\xA6" => ' ', // BROKEN BAR
			"\xC2\xAB" => ' ', // LEFT-POINTING DOUBLE ANGLE QUOTATION MARK
			"\xC2\xB1" => ' ', // PLUS-MINUS SIGN
			"\xC2\xBB" => ' ', // RIGHT-POINTING DOUBLE ANGLE QUOTATION MARK
			"\xC2\xBF" => ' ', // INVERTED QUESTION MARK
			"\xC3\x97" => ' ', // MULTIPLICATION SIGN
			"\xC3\xB7" => ' ', // DIVISION SIGN

			"\xE2\x80\x80" => ' ', // EN QUAD
			"\xE2\x80\x81" => ' ', // EM QUAD
			"\xE2\x80\x82" => ' ', // EN SPACE
			"\xE2\x80\x83" => ' ', // EM SPACE
			"\xE2\x80\x84" => ' ', // THREE-PER-EM SPACE
			"\xE2\x80\x85" => ' ', // FOUR-PER-EM SPACE
			"\xE2\x80\x86" => ' ', // SIX-PER-EM SPACE
			"\xE2\x80\x87" => ' ', // FIGURE SPACE
			"\xE2\x80\x88" => ' ', // PUNCTUATION SPACE
			"\xE2\x80\x89" => ' ', // THIN SPACE
			"\xE2\x80\x8A" => ' ', // HAIR SPACE
			"\xE2\x80\x8B" => ' ', // ZERO WIDTH SPACE
			"\xE2\x80\x8C" => ' ', // ZERO WIDTH NON-JOINER
			"\xE2\x80\x8E" => ' ', // LEFT-TO-RIGHT MARK
			"\xE2\x80\x8F" => ' ', // RIGHT-TO-LEFT MARK

			"\xE2\x80\x90" => '-', // HYPHEN
			"\xE2\x80\x91" => '-', // NON-BREAKING HYPHEN
			"\xE2\x80\x92" => '-', // FIGURE DASH
			"\xE2\x80\x93" => '-', // EN DASH
			"\xE2\x80\x94" => '-', // EM DASH
			"\xE2\x80\x95" => '-', // HORIZONTAL BAR

			"\xE2\x80\x96" => ' ', // DOUBLE VERTICAL LINE
			"\xE2\x80\x98" => ' ', // LEFT SINGLE QUOTATION MARK
			"\xE2\x80\x99" => "'", // RIGHT SINGLE QUOTATION MARK (kept as an apostrophe rather than a space)
			"\xE2\x80\x9A" => ' ', // SINGLE LOW-9 QUOTATION MARK
			"\xE2\x80\x9B" => ' ', // SINGLE HIGH-REVERSED-9 QUOTATION MARK
			"\xE2\x80\x9C" => ' ', // LEFT DOUBLE QUOTATION MARK
			"\xE2\x80\x9D" => ' ', // RIGHT DOUBLE QUOTATION MARK
			"\xE2\x80\x9E" => ' ', // DOUBLE LOW-9 QUOTATION MARK
			"\xE2\x80\x9F" => ' ', // DOUBLE HIGH-REVERSED-9 QUOTATION MARK

			"\xE2\x80\xA2" => ' ', // BULLET
			"\xE2\x80\xA4" => ' ', // ONE DOT LEADER
			"\xE2\x80\xA5" => ' ', // TWO DOT LEADER
			"\xE2\x80\xA6" => ' ', // HORIZONTAL ELLIPSIS
			"\xE2\x80\xB9" => ' ', // SINGLE LEFT-POINTING ANGLE QUOTATION MARK
			"\xE2\x80\xBA" => ' ', // SINGLE RIGHT-POINTING ANGLE QUOTATION MARK
			"\xE2\x80\xBC" => ' ', // DOUBLE EXCLAMATION MARK
			"\xE2\x80\xBD" => ' ', // INTERROBANG
			"\xE2\x81\x87" => ' ', // DOUBLE QUESTION MARK
			"\xE2\x81\x88" => ' ', // QUESTION EXCLAMATION MARK
			"\xE2\x81\x89" => ' ', // EXCLAMATION QUESTION MARK

			"\xE3\x80\x80" => ' ', // IDEOGRAPHIC SPACE
			"\xE3\x80\x81" => ' ', // IDEOGRAPHIC COMMA
			"\xE3\x80\x82" => ' ', // IDEOGRAPHIC FULL STOP
		);

		$qa_utf8removeaccents=array( // convert UTF-8 accented characters to basic Roman characters
			"\xC3\x80" => 'A', // LATIN CAPITAL LETTER A WITH GRAVE
			"\xC3\x81" => 'A', // LATIN CAPITAL LETTER A WITH ACUTE
			"\xC3\x82" => 'A', // LATIN CAPITAL LETTER A WITH CIRCUMFLEX
			"\xC3\x83" => 'A', // LATIN CAPITAL LETTER A WITH TILDE
			"\xC3\x84" => 'A', // LATIN CAPITAL LETTER A WITH DIAERESIS
			"\xC3\x85" => 'A', // LATIN CAPITAL LETTER A WITH RING ABOVE
			"\xC3\x86" => 'AE', // LATIN CAPITAL LETTER AE
			"\xC3\x87" => 'C', // LATIN CAPITAL LETTER C WITH CEDILLA
			"\xC3\x88" => 'E', // LATIN CAPITAL LETTER E WITH GRAVE
			"\xC3\x89" => 'E', // LATIN CAPITAL LETTER E WITH ACUTE
			"\xC3\x8A" => 'E', // LATIN CAPITAL LETTER E WITH CIRCUMFLEX
			"\xC3\x8B" => 'E', // LATIN CAPITAL LETTER E WITH DIAERESIS
			"\xC3\x8C" => 'I', // LATIN CAPITAL LETTER I WITH GRAVE
			"\xC3\x8D" => 'I', // LATIN CAPITAL LETTER I WITH ACUTE
			"\xC3\x8E" => 'I', // LATIN CAPITAL LETTER I WITH CIRCUMFLEX
			"\xC3\x8F" => 'I', // LATIN CAPITAL LETTER I WITH DIAERESIS
			"\xC3\x91" => 'N', // LATIN CAPITAL LETTER N WITH TILDE
			"\xC3\x92" => 'O', // LATIN CAPITAL LETTER O WITH GRAVE
			"\xC3\x93" => 'O', // LATIN CAPITAL LETTER O WITH ACUTE
			"\xC3\x94" => 'O', // LATIN CAPITAL LETTER O WITH CIRCUMFLEX
			"\xC3\x95" => 'O', // LATIN CAPITAL LETTER O WITH TILDE
			"\xC3\x96" => 'O', // LATIN CAPITAL LETTER O WITH DIAERESIS
			"\xC3\x98" => 'O', // LATIN CAPITAL LETTER O WITH STROKE
			"\xC3\x99" => 'U', // LATIN CAPITAL LETTER U WITH GRAVE
			"\xC3\x9A" => 'U', // LATIN CAPITAL LETTER U WITH ACUTE
			"\xC3\x9B" => 'U', // LATIN CAPITAL LETTER U WITH CIRCUMFLEX
			"\xC3\x9C" => 'U', // LATIN CAPITAL LETTER U WITH DIAERESIS
			"\xC3\x9D" => 'Y', // LATIN CAPITAL LETTER Y WITH ACUTE
			"\xC3\x9F" => 'ss', // LATIN SMALL LETTER SHARP S
			"\xC3\xA0" => 'a', // LATIN SMALL LETTER A WITH GRAVE
			"\xC3\xA1" => 'a', // LATIN SMALL LETTER A WITH ACUTE
			"\xC3\xA2" => 'a', // LATIN SMALL LETTER A WITH CIRCUMFLEX
			"\xC3\xA3" => 'a', // LATIN SMALL LETTER A WITH TILDE
			"\xC3\xA4" => 'a', // LATIN SMALL LETTER A WITH DIAERESIS
			"\xC3\xA5" => 'a', // LATIN SMALL LETTER A WITH RING ABOVE
			"\xC3\xA6" => 'ae', // LATIN SMALL LETTER AE
			"\xC3\xA7" => 'c', // LATIN SMALL LETTER C WITH CEDILLA
			"\xC3\xA8" => 'e', // LATIN SMALL LETTER E WITH GRAVE
			"\xC3\xA9" => 'e', // LATIN SMALL LETTER E WITH ACUTE
			"\xC3\xAA" => 'e', // LATIN SMALL LETTER E WITH CIRCUMFLEX
			"\xC3\xAB" => 'e', // LATIN SMALL LETTER E WITH DIAERESIS
			"\xC3\xAC" => 'i', // LATIN SMALL LETTER I WITH GRAVE
			"\xC3\xAD" => 'i', // LATIN SMALL LETTER I WITH ACUTE
			"\xC3\xAE" => 'i', // LATIN SMALL LETTER I WITH CIRCUMFLEX
			"\xC3\xAF" => 'i', // LATIN SMALL LETTER I WITH DIAERESIS
			"\xC3\xB1" => 'n', // LATIN SMALL LETTER N WITH TILDE
			"\xC3\xB2" => 'o', // LATIN SMALL LETTER O WITH GRAVE
			"\xC3\xB3" => 'o', // LATIN SMALL LETTER O WITH ACUTE
			"\xC3\xB4" => 'o', // LATIN SMALL LETTER O WITH CIRCUMFLEX
			"\xC3\xB5" => 'o', // LATIN SMALL LETTER O WITH TILDE
			"\xC3\xB6" => 'o', // LATIN SMALL LETTER O WITH DIAERESIS
			"\xC3\xB8" => 'o', // LATIN SMALL LETTER O WITH STROKE
			"\xC3\xB9" => 'u', // LATIN SMALL LETTER U WITH GRAVE
			"\xC3\xBA" => 'u', // LATIN SMALL LETTER U WITH ACUTE
			"\xC3\xBB" => 'u', // LATIN SMALL LETTER U WITH CIRCUMFLEX
			"\xC3\xBC" => 'u', // LATIN SMALL LETTER U WITH DIAERESIS
			"\xC3\xBD" => 'y', // LATIN SMALL LETTER Y WITH ACUTE
			"\xC3\xBF" => 'y', // LATIN SMALL LETTER Y WITH DIAERESIS
			"\xC4\x80" => 'A', // LATIN CAPITAL LETTER A WITH MACRON
			"\xC4\x81" => 'a', // LATIN SMALL LETTER A WITH MACRON
			"\xC4\x82" => 'A', // LATIN CAPITAL LETTER A WITH BREVE
			"\xC4\x83" => 'a', // LATIN SMALL LETTER A WITH BREVE
			"\xC4\x84" => 'A', // LATIN CAPITAL LETTER A WITH OGONEK
			"\xC4\x85" => 'a', // LATIN SMALL LETTER A WITH OGONEK
			"\xC4\x86" => 'C', // LATIN CAPITAL LETTER C WITH ACUTE
			"\xC4\x87" => 'c', // LATIN SMALL LETTER C WITH ACUTE
			"\xC4\x88" => 'C', // LATIN CAPITAL LETTER C WITH CIRCUMFLEX
			"\xC4\x89" => 'c', // LATIN SMALL LETTER C WITH CIRCUMFLEX
			"\xC4\x8A" => 'C', // LATIN CAPITAL LETTER C WITH DOT ABOVE
			"\xC4\x8B" => 'c', // LATIN SMALL LETTER C WITH DOT ABOVE
			"\xC4\x8C" => 'C', // LATIN CAPITAL LETTER C WITH CARON
			"\xC4\x8D" => 'c', // LATIN SMALL LETTER C WITH CARON
			"\xC4\x8E" => 'D', // LATIN CAPITAL LETTER D WITH CARON
			"\xC4\x8F" => 'd', // LATIN SMALL LETTER D WITH CARON
			"\xC4\x90" => 'D', // LATIN CAPITAL LETTER D WITH STROKE
			"\xC4\x91" => 'd', // LATIN SMALL LETTER D WITH STROKE
			"\xC4\x92" => 'E', // LATIN CAPITAL LETTER E WITH MACRON
			"\xC4\x93" => 'e', // LATIN SMALL LETTER E WITH MACRON
			"\xC4\x94" => 'E', // LATIN CAPITAL LETTER E WITH BREVE
			"\xC4\x95" => 'e', // LATIN SMALL LETTER E WITH BREVE
			"\xC4\x96" => 'E', // LATIN CAPITAL LETTER E WITH DOT ABOVE
			"\xC4\x97" => 'e', // LATIN SMALL LETTER E WITH DOT ABOVE
			"\xC4\x98" => 'E', // LATIN CAPITAL LETTER E WITH OGONEK
			"\xC4\x99" => 'e', // LATIN SMALL LETTER E WITH OGONEK
			"\xC4\x9A" => 'E', // LATIN CAPITAL LETTER E WITH CARON
			"\xC4\x9B" => 'e', // LATIN SMALL LETTER E WITH CARON
			"\xC4\x9C" => 'G', // LATIN CAPITAL LETTER G WITH CIRCUMFLEX
			"\xC4\x9D" => 'g', // LATIN SMALL LETTER G WITH CIRCUMFLEX
			"\xC4\x9E" => 'G', // LATIN CAPITAL LETTER G WITH BREVE
			"\xC4\x9F" => 'g', // LATIN SMALL LETTER G WITH BREVE
			"\xC4\xA0" => 'G', // LATIN CAPITAL LETTER G WITH DOT ABOVE
			"\xC4\xA1" => 'g', // LATIN SMALL LETTER G WITH DOT ABOVE
			"\xC4\xA2" => 'G', // LATIN CAPITAL LETTER G WITH CEDILLA
			"\xC4\xA3" => 'g', // LATIN SMALL LETTER G WITH CEDILLA
			"\xC4\xA4" => 'H', // LATIN CAPITAL LETTER H WITH CIRCUMFLEX
			"\xC4\xA5" => 'h', // LATIN SMALL LETTER H WITH CIRCUMFLEX
			"\xC4\xA6" => 'H', // LATIN CAPITAL LETTER H WITH STROKE
			"\xC4\xA7" => 'h', // LATIN SMALL LETTER H WITH STROKE
			"\xC4\xA8" => 'I', // LATIN CAPITAL LETTER I WITH TILDE
			"\xC4\xA9" => 'i', // LATIN SMALL LETTER I WITH TILDE
			"\xC4\xAA" => 'I', // LATIN CAPITAL LETTER I WITH MACRON
			"\xC4\xAB" => 'i', // LATIN SMALL LETTER I WITH MACRON
			"\xC4\xAC" => 'I', // LATIN CAPITAL LETTER I WITH BREVE
			"\xC4\xAD" => 'i', // LATIN SMALL LETTER I WITH BREVE
			"\xC4\xAE" => 'I', // LATIN CAPITAL LETTER I WITH OGONEK
			"\xC4\xAF" => 'i', // LATIN SMALL LETTER I WITH OGONEK
			"\xC4\xB0" => 'I', // LATIN CAPITAL LETTER I WITH DOT ABOVE
			"\xC4\xB1" => 'i', // LATIN SMALL LETTER DOTLESS I
			"\xC4\xB2" => 'IJ', // LATIN CAPITAL LIGATURE IJ
			"\xC4\xB3" => 'ij', // LATIN SMALL LIGATURE IJ
			"\xC4\xB4" => 'J', // LATIN CAPITAL LETTER J WITH CIRCUMFLEX
			"\xC4\xB5" => 'j', // LATIN SMALL LETTER J WITH CIRCUMFLEX
			"\xC4\xB6" => 'K', // LATIN CAPITAL LETTER K WITH CEDILLA
			"\xC4\xB7" => 'k', // LATIN SMALL LETTER K WITH CEDILLA
			"\xC4\xB9" => 'L', // LATIN CAPITAL LETTER L WITH ACUTE
			"\xC4\xBA" => 'l', // LATIN SMALL LETTER L WITH ACUTE
			"\xC4\xBB" => 'L', // LATIN CAPITAL LETTER L WITH CEDILLA
			"\xC4\xBC" => 'l', // LATIN SMALL LETTER L WITH CEDILLA
			"\xC4\xBD" => 'L', // LATIN CAPITAL LETTER L WITH CARON
			"\xC4\xBE" => 'l', // LATIN SMALL LETTER L WITH CARON
			"\xC4\xBF" => 'L', // LATIN CAPITAL LETTER L WITH MIDDLE DOT
			"\xC5\x80" => 'l', // LATIN SMALL LETTER L WITH MIDDLE DOT
			"\xC5\x81" => 'L', // LATIN CAPITAL LETTER L WITH STROKE
			"\xC5\x82" => 'l', // LATIN SMALL LETTER L WITH STROKE
			"\xC5\x83" => 'N', // LATIN CAPITAL LETTER N WITH ACUTE
			"\xC5\x84" => 'n', // LATIN SMALL LETTER N WITH ACUTE
			"\xC5\x85" => 'N', // LATIN CAPITAL LETTER N WITH CEDILLA
			"\xC5\x86" => 'n', // LATIN SMALL LETTER N WITH CEDILLA
			"\xC5\x87" => 'N', // LATIN CAPITAL LETTER N WITH CARON
			"\xC5\x88" => 'n', // LATIN SMALL LETTER N WITH CARON
			"\xC5\x8C" => 'O', // LATIN CAPITAL LETTER O WITH MACRON
			"\xC5\x8D" => 'o', // LATIN SMALL LETTER O WITH MACRON
			"\xC5\x8E" => 'O', // LATIN CAPITAL LETTER O WITH BREVE
			"\xC5\x8F" => 'o', // LATIN SMALL LETTER O WITH BREVE
			"\xC5\x90" => 'O', // LATIN CAPITAL LETTER O WITH DOUBLE ACUTE
			"\xC5\x91" => 'o', // LATIN SMALL LETTER O WITH DOUBLE ACUTE
			"\xC5\x92" => 'OE', // LATIN CAPITAL LIGATURE OE
			"\xC5\x93" => 'oe', // LATIN SMALL LIGATURE OE
			"\xC5\x94" => 'R', // LATIN CAPITAL LETTER R WITH ACUTE
			"\xC5\x95" => 'r', // LATIN SMALL LETTER R WITH ACUTE
			"\xC5\x96" => 'R', // LATIN CAPITAL LETTER R WITH CEDILLA
			"\xC5\x97" => 'r', // LATIN SMALL LETTER R WITH CEDILLA
			"\xC5\x98" => 'R', // LATIN CAPITAL LETTER R WITH CARON
			"\xC5\x99" => 'r', // LATIN SMALL LETTER R WITH CARON
			"\xC5\x9A" => 'S', // LATIN CAPITAL LETTER S WITH ACUTE
			"\xC5\x9B" => 's', // LATIN SMALL LETTER S WITH ACUTE
			"\xC5\x9C" => 'S', // LATIN CAPITAL LETTER S WITH CIRCUMFLEX
			"\xC5\x9D" => 's', // LATIN SMALL LETTER S WITH CIRCUMFLEX
			"\xC5\x9E" => 'S', // LATIN CAPITAL LETTER S WITH CEDILLA
			"\xC5\x9F" => 's', // LATIN SMALL LETTER S WITH CEDILLA
			"\xC5\xA0" => 'S', // LATIN CAPITAL LETTER S WITH CARON
			"\xC5\xA1" => 's', // LATIN SMALL LETTER S WITH CARON
			"\xC5\xA2" => 'T', // LATIN CAPITAL LETTER T WITH CEDILLA
			"\xC5\xA3" => 't', // LATIN SMALL LETTER T WITH CEDILLA
			"\xC5\xA4" => 'T', // LATIN CAPITAL LETTER T WITH CARON
			"\xC5\xA5" => 't', // LATIN SMALL LETTER T WITH CARON
			"\xC5\xA6" => 'T', // LATIN CAPITAL LETTER T WITH STROKE
			"\xC5\xA7" => 't', // LATIN SMALL LETTER T WITH STROKE
			"\xC5\xA8" => 'U', // LATIN CAPITAL LETTER U WITH TILDE
			"\xC5\xA9" => 'u', // LATIN SMALL LETTER U WITH TILDE
			"\xC5\xAA" => 'U', // LATIN CAPITAL LETTER U WITH MACRON
			"\xC5\xAB" => 'u', // LATIN SMALL LETTER U WITH MACRON
			"\xC5\xAC" => 'U', // LATIN CAPITAL LETTER U WITH BREVE
			"\xC5\xAD" => 'u', // LATIN SMALL LETTER U WITH BREVE
			"\xC5\xAE" => 'U', // LATIN CAPITAL LETTER U WITH RING ABOVE
			"\xC5\xAF" => 'u', // LATIN SMALL LETTER U WITH RING ABOVE
			"\xC5\xB0" => 'U', // LATIN CAPITAL LETTER U WITH DOUBLE ACUTE
			"\xC5\xB1" => 'u', // LATIN SMALL LETTER U WITH DOUBLE ACUTE
			"\xC5\xB2" => 'U', // LATIN CAPITAL LETTER U WITH OGONEK
			"\xC5\xB3" => 'u', // LATIN SMALL LETTER U WITH OGONEK
			"\xC5\xB4" => 'W', // LATIN CAPITAL LETTER W WITH CIRCUMFLEX
			"\xC5\xB5" => 'w', // LATIN SMALL LETTER W WITH CIRCUMFLEX
			"\xC5\xB6" => 'Y', // LATIN CAPITAL LETTER Y WITH CIRCUMFLEX
			"\xC5\xB7" => 'y', // LATIN SMALL LETTER Y WITH CIRCUMFLEX
			"\xC5\xB8" => 'Y', // LATIN CAPITAL LETTER Y WITH DIAERESIS
			"\xC5\xB9" => 'Z', // LATIN CAPITAL LETTER Z WITH ACUTE
			"\xC5\xBA" => 'z', // LATIN SMALL LETTER Z WITH ACUTE
			"\xC5\xBB" => 'Z', // LATIN CAPITAL LETTER Z WITH DOT ABOVE
			"\xC5\xBC" => 'z', // LATIN SMALL LETTER Z WITH DOT ABOVE
			"\xC5\xBD" => 'Z', // LATIN CAPITAL LETTER Z WITH CARON
			"\xC5\xBE" => 'z', // LATIN SMALL LETTER Z WITH CARON
			"\xC6\x80" => 'b', // LATIN SMALL LETTER B WITH STROKE
			"\xC6\x81" => 'B', // LATIN CAPITAL LETTER B WITH HOOK
			"\xC6\x82" => 'B', // LATIN CAPITAL LETTER B WITH TOPBAR
			"\xC6\x83" => 'b', // LATIN SMALL LETTER B WITH TOPBAR
			"\xC6\x87" => 'C', // LATIN CAPITAL LETTER C WITH HOOK
			"\xC6\x88" => 'c', // LATIN SMALL LETTER C WITH HOOK
			"\xC6\x89" => 'D', // LATIN CAPITAL LETTER AFRICAN D
			"\xC6\x8A" => 'D', // LATIN CAPITAL LETTER D WITH HOOK
			"\xC6\x8B" => 'D', // LATIN CAPITAL LETTER D WITH TOPBAR
			"\xC6\x8C" => 'd', // LATIN SMALL LETTER D WITH TOPBAR
			"\xC6\x91" => 'F', // LATIN CAPITAL LETTER F WITH HOOK
			"\xC6\x92" => 'f', // LATIN SMALL LETTER F WITH HOOK
			"\xC6\x93" => 'G', // LATIN CAPITAL LETTER G WITH HOOK
			"\xC6\x97" => 'I', // LATIN CAPITAL LETTER I WITH STROKE
			"\xC6\x98" => 'K', // LATIN CAPITAL LETTER K WITH HOOK
			"\xC6\x99" => 'k', // LATIN SMALL LETTER K WITH HOOK
			"\xC6\x9A" => 'l', // LATIN SMALL LETTER L WITH BAR
			"\xC6\x9D" => 'N', // LATIN CAPITAL LETTER N WITH LEFT HOOK
			"\xC6\x9E" => 'n', // LATIN SMALL LETTER N WITH LONG RIGHT LEG
			"\xC6\x9F" => 'O', // LATIN CAPITAL LETTER O WITH MIDDLE TILDE
			"\xC6\xA0" => 'O', // LATIN CAPITAL LETTER O WITH HORN
			"\xC6\xA1" => 'o', // LATIN SMALL LETTER O WITH HORN
			"\xC6\xA2" => 'OI', // LATIN CAPITAL LETTER OI
			"\xC6\xA3" => 'oi', // LATIN SMALL LETTER OI
			"\xC6\xA4" => 'P', // LATIN CAPITAL LETTER P WITH HOOK
			"\xC6\xA5" => 'p', // LATIN SMALL LETTER P WITH HOOK
			"\xC6\xAB" => 't', // LATIN SMALL LETTER T WITH PALATAL HOOK
			"\xC6\xAC" => 'T', // LATIN CAPITAL LETTER T WITH HOOK
			"\xC6\xAD" => 't', // LATIN SMALL LETTER T WITH HOOK
			"\xC6\xAE" => 'T', // LATIN CAPITAL LETTER T WITH RETROFLEX HOOK
			"\xC6\xAF" => 'U', // LATIN CAPITAL LETTER U WITH HORN
			"\xC6\xB0" => 'u', // LATIN SMALL LETTER U WITH HORN
			"\xC6\xB2" => 'V', // LATIN CAPITAL LETTER V WITH HOOK
			"\xC6\xB3" => 'Y', // LATIN CAPITAL LETTER Y WITH HOOK
			"\xC6\xB4" => 'y', // LATIN SMALL LETTER Y WITH HOOK
			"\xC6\xB5" => 'Z', // LATIN CAPITAL LETTER Z WITH STROKE
			"\xC6\xB6" => 'z', // LATIN SMALL LETTER Z WITH STROKE
			"\xC7\x8D" => 'A', // LATIN CAPITAL LETTER A WITH CARON
			"\xC7\x8E" => 'a', // LATIN SMALL LETTER A WITH CARON
			"\xC7\x8F" => 'I', // LATIN CAPITAL LETTER I WITH CARON
			"\xC7\x90" => 'i', // LATIN SMALL LETTER I WITH CARON
			"\xC7\x91" => 'O', // LATIN CAPITAL LETTER O WITH CARON
			"\xC7\x92" => 'o', // LATIN SMALL LETTER O WITH CARON
			"\xC7\x93" => 'U', // LATIN CAPITAL LETTER U WITH CARON
			"\xC7\x94" => 'u', // LATIN SMALL LETTER U WITH CARON
			"\xC7\x95" => 'U', // LATIN CAPITAL LETTER U WITH DIAERESIS AND MACRON
			"\xC7\x96" => 'u', // LATIN SMALL LETTER U WITH DIAERESIS AND MACRON
			"\xC7\x97" => 'U', // LATIN CAPITAL LETTER U WITH DIAERESIS AND ACUTE
			"\xC7\x98" => 'u', // LATIN SMALL LETTER U WITH DIAERESIS AND ACUTE
			"\xC7\x99" => 'U', // LATIN CAPITAL LETTER U WITH DIAERESIS AND CARON
			"\xC7\x9A" => 'u', // LATIN SMALL LETTER U WITH DIAERESIS AND CARON
			"\xC7\x9B" => 'U', // LATIN CAPITAL LETTER U WITH DIAERESIS AND GRAVE
			"\xC7\x9C" => 'u', // LATIN SMALL LETTER U WITH DIAERESIS AND GRAVE
			"\xC7\x9E" => 'A', // LATIN CAPITAL LETTER A WITH DIAERESIS AND MACRON
			"\xC7\x9F" => 'a', // LATIN SMALL LETTER A WITH DIAERESIS AND MACRON
			"\xC7\xA0" => 'A', // LATIN CAPITAL LETTER A WITH DOT ABOVE AND MACRON
			"\xC7\xA1" => 'a', // LATIN SMALL LETTER A WITH DOT ABOVE AND MACRON
			"\xC7\xA2" => 'AE', // LATIN CAPITAL LETTER AE WITH MACRON
			"\xC7\xA3" => 'ae', // LATIN SMALL LETTER AE WITH MACRON
			"\xC7\xA4" => 'G', // LATIN CAPITAL LETTER G WITH STROKE
			"\xC7\xA5" => 'g', // LATIN SMALL LETTER G WITH STROKE
			"\xC7\xA6" => 'G', // LATIN CAPITAL LETTER G WITH CARON
			"\xC7\xA7" => 'g', // LATIN SMALL LETTER G WITH CARON
			"\xC7\xA8" => 'K', // LATIN CAPITAL LETTER K WITH CARON
			"\xC7\xA9" => 'k', // LATIN SMALL LETTER K WITH CARON
			"\xC7\xAA" => 'O', // LATIN CAPITAL LETTER O WITH OGONEK
			"\xC7\xAB" => 'o', // LATIN SMALL LETTER O WITH OGONEK
			"\xC7\xAC" => 'O', // LATIN CAPITAL LETTER O WITH OGONEK AND MACRON
			"\xC7\xAD" => 'o', // LATIN SMALL LETTER O WITH OGONEK AND MACRON
			"\xC7\xB0" => 'j', // LATIN SMALL LETTER J WITH CARON
			"\xC7\xB4" => 'G', // LATIN CAPITAL LETTER G WITH ACUTE
			"\xC7\xB5" => 'g', // LATIN SMALL LETTER G WITH ACUTE
			"\xC7\xB8" => 'N', // LATIN CAPITAL LETTER N WITH GRAVE
			"\xC7\xB9" => 'n', // LATIN SMALL LETTER N WITH GRAVE
			"\xC7\xBA" => 'A', // LATIN CAPITAL LETTER A WITH RING ABOVE AND ACUTE
			"\xC7\xBB" => 'a', // LATIN SMALL LETTER A WITH RING ABOVE AND ACUTE
			"\xC7\xBC" => 'AE', // LATIN CAPITAL LETTER AE WITH ACUTE
			"\xC7\xBD" => 'ae', // LATIN SMALL LETTER AE WITH ACUTE
			"\xC7\xBE" => 'O', // LATIN CAPITAL LETTER O WITH STROKE AND ACUTE
			"\xC7\xBF" => 'o', // LATIN SMALL LETTER O WITH STROKE AND ACUTE
			"\xC8\x80" => 'A', // LATIN CAPITAL LETTER A WITH DOUBLE GRAVE
			"\xC8\x81" => 'a', // LATIN SMALL LETTER A WITH DOUBLE GRAVE
			"\xC8\x82" => 'A', // LATIN CAPITAL LETTER A WITH INVERTED BREVE
			"\xC8\x83" => 'a', // LATIN SMALL LETTER A WITH INVERTED BREVE
			"\xC8\x84" => 'E', // LATIN CAPITAL LETTER E WITH DOUBLE GRAVE
			"\xC8\x85" => 'e', // LATIN SMALL LETTER E WITH DOUBLE GRAVE
			"\xC8\x86" => 'E', // LATIN CAPITAL LETTER E WITH INVERTED BREVE
			"\xC8\x87" => 'e', // LATIN SMALL LETTER E WITH INVERTED BREVE
			"\xC8\x88" => 'I', // LATIN CAPITAL LETTER I WITH DOUBLE GRAVE
			"\xC8\x89" => 'i', // LATIN SMALL LETTER I WITH DOUBLE GRAVE
			"\xC8\x8A" => 'I', // LATIN CAPITAL LETTER I WITH INVERTED BREVE
			"\xC8\x8B" => 'i', // LATIN SMALL LETTER I WITH INVERTED BREVE
			"\xC8\x8C" => 'O', // LATIN CAPITAL LETTER O WITH DOUBLE GRAVE
			"\xC8\x8D" => 'o', // LATIN SMALL LETTER O WITH DOUBLE GRAVE
			"\xC8\x8E" => 'O', // LATIN CAPITAL LETTER O WITH INVERTED BREVE
			"\xC8\x8F" => 'o', // LATIN SMALL LETTER O WITH INVERTED BREVE
			"\xC8\x90" => 'R', // LATIN CAPITAL LETTER R WITH DOUBLE GRAVE
			"\xC8\x91" => 'r', // LATIN SMALL LETTER R WITH DOUBLE GRAVE
			"\xC8\x92" => 'R', // LATIN CAPITAL LETTER R WITH INVERTED BREVE
			"\xC8\x93" => 'r', // LATIN SMALL LETTER R WITH INVERTED BREVE
			"\xC8\x94" => 'U', // LATIN CAPITAL LETTER U WITH DOUBLE GRAVE
			"\xC8\x95" => 'u', // LATIN SMALL LETTER U WITH DOUBLE GRAVE
			"\xC8\x96" => 'U', // LATIN CAPITAL LETTER U WITH INVERTED BREVE
			"\xC8\x97" => 'u', // LATIN SMALL LETTER U WITH INVERTED BREVE
			"\xC8\x98" => 'S', // LATIN CAPITAL LETTER S WITH COMMA BELOW
			"\xC8\x99" => 's', // LATIN SMALL LETTER S WITH COMMA BELOW
			"\xC8\x9A" => 'T', // LATIN CAPITAL LETTER T WITH COMMA BELOW
			"\xC8\x9B" => 't', // LATIN SMALL LETTER T WITH COMMA BELOW
			"\xC8\x9E" => 'H', // LATIN CAPITAL LETTER H WITH CARON
			"\xC8\x9F" => 'h', // LATIN SMALL LETTER H WITH CARON
			"\xC8\xA0" => 'N', // LATIN CAPITAL LETTER N WITH LONG RIGHT LEG
			"\xC8\xA1" => 'd', // LATIN SMALL LETTER D WITH CURL
			"\xC8\xA4" => 'Z', // LATIN CAPITAL LETTER Z WITH HOOK
			"\xC8\xA5" => 'z', // LATIN SMALL LETTER Z WITH HOOK
			"\xC8\xA6" => 'A', // LATIN CAPITAL LETTER A WITH DOT ABOVE
			"\xC8\xA7" => 'a', // LATIN SMALL LETTER A WITH DOT ABOVE
			"\xC8\xA8" => 'E', // LATIN CAPITAL LETTER E WITH CEDILLA
			"\xC8\xA9" => 'e', // LATIN SMALL LETTER E WITH CEDILLA
			"\xC8\xAA" => 'O', // LATIN CAPITAL LETTER O WITH DIAERESIS AND MACRON
			"\xC8\xAB" => 'o', // LATIN SMALL LETTER O WITH DIAERESIS AND MACRON
			"\xC8\xAC" => 'O', // LATIN CAPITAL LETTER O WITH TILDE AND MACRON
			"\xC8\xAD" => 'o', // LATIN SMALL LETTER O WITH TILDE AND MACRON
			"\xC8\xAE" => 'O', // LATIN CAPITAL LETTER O WITH DOT ABOVE
			"\xC8\xAF" => 'o', // LATIN SMALL LETTER O WITH DOT ABOVE
			"\xC8\xB0" => 'O', // LATIN CAPITAL LETTER O WITH DOT ABOVE AND MACRON
			"\xC8\xB1" => 'o', // LATIN SMALL LETTER O WITH DOT ABOVE AND MACRON
			"\xC8\xB2" => 'Y', // LATIN CAPITAL LETTER Y WITH MACRON
			"\xC8\xB3" => 'y', // LATIN SMALL LETTER Y WITH MACRON
			"\xC8\xB4" => 'l', // LATIN SMALL LETTER L WITH CURL
			"\xC8\xB5" => 'n', // LATIN SMALL LETTER N WITH CURL
			"\xC8\xB6" => 't', // LATIN SMALL LETTER T WITH CURL
			"\xC8\xB7" => 'j', // LATIN SMALL LETTER DOTLESS J
		);
	}
function qa_string_to_words($string, $tolowercase=true, $delimiters=false, $splitideographs=true, $splithyphens=true)
/*
Return the UTF-8 input string converted into an array of words, changed $tolowercase (or not).
Set $delimiters to true to keep the delimiters after each word and tweak what we used for word
splitting with $splitideographs and $splithyphens.
*/
{
if (qa_to_override(__FUNCTION__)) { $args=func_get_args(); return qa_call_override(__FUNCTION__, $args); }
global $qa_utf8punctuation;
if ($tolowercase)
$string=qa_strtolower($string);
$string=strtr($string, $qa_utf8punctuation);
$separator=QA_PREG_INDEX_WORD_SEPARATOR;
if ($splithyphens)
$separator.='|-';
if ($delimiters) {
if ($splitideographs)
$separator.='|'.QA_PREG_CJK_IDEOGRAPHS_UTF8;
} else {
$string=preg_replace("/(S)'(S)/", '12', $string); // remove apostrophes in words
if ($splitideographs) // put spaces around CJK ideographs so they're treated as separate words
$string=preg_replace('/'.QA_PREG_CJK_IDEOGRAPHS_UTF8.'/', '