Česky
Kamil Dudka

Web components

File detail

Name:DownloadCharsetDetector.class.php [Download]
Location: src > lib
Size:13.8 KB
Last modification:2013-04-23 23:18

Source code

<?php
/**
 * @file CharsetDetector.class.php
 * Definition of CharsetDetector class group.
 * @ingroup CharsetDetector
 * @author Kamil Dudka <xdudka00@gmail.com>
 * @date 2008-01-30
 */
 
/**
 * @defgroup CharsetDetector
 * At http://dudka.cz/CharsetDetector may be available abstract, documentation
 * and tutorial for this component.\ See CharsetDetector class, if you don't
 * know where to begin.
 */
 
 
/**
 * Intelligent charset detector - basic interface.
 * @ingroup CharsetDetector
 * This class is an intelligent wrapper around the iconv library. It can
 * guess the charset of a given text using static analysis. The default behavior
 * is highly extensible - you can load configuration for different
 * charsets or different languages, and you can do so dynamically.
 * @note Design pattern @b facade.
 */
class CharsetDetector {
  const DEFAULT_TARGET_CHARSET = "UTF-8";     ///< Default target charset.\ Can be overridden on the fly.
  const DEFAULT_MAX_ANALYZED_LENGTH = 65536;  ///< Default value for maxLengthAnalyzed parameter of analyze method.
  const DEFAULT_MIN_RELEVANCE = 0.1;          ///< Default value for minRelevance property.

  private $analyzer;       ///< Analyzer that receives the non-ASCII bytes and accumulates charset weights.
  private $minRelevance;   ///< Minimal relative weight the best charset must reach before conversion happens.
  private $targetCharset;  ///< Charset name handed to the converter as the conversion target.

  /**
   * Intelligent "equivalent" of file_get_contents standard PHP function.
   * @note This is the simplest (and least configurable) way to use this class.
   * @param filename Name of file to read (and recode if needed).
   * @return Return text of the file recoded to the @b default target charset,
   * or the unchanged content if no relevant charset was detected.
   */
  public static function file_get_contents($filename) {
    return self::convert(file_get_contents($filename));
  }

  /**
   * Convert string to desired charset if detected charset is relevant @e enough.
   * A fresh detector with the default analyzer is created for every call.
   * @param stringToConvert String to convert - input of this method.
   * @param targetCharset Charset to convert to. Using iconv's mark-up.
   * @return Return converted string if charset was detected, or original in other case.
   */
  public static function convert($stringToConvert, $targetCharset = self::DEFAULT_TARGET_CHARSET) {
    $detector = new self;
    $detector->setTargetCharset($targetCharset);
    $detector->analyze($stringToConvert);
    return $detector->convertIfRelevant($stringToConvert);
  }

  /**
   * @param charsetStreamAnalyzer Initialized instance of analyzer.
   * Use CharsetStreamAnalyzerFactory class to create such object, or
   * omit this parameter to use default analyzer.
   */
  public function __construct($charsetStreamAnalyzer = null) {
    // Loose comparison is kept on purpose so that callers passing any
    // null-like value still get the default analyzer, as before.
    if (null == $charsetStreamAnalyzer) {
      $charsetStreamAnalyzer = CharsetStreamAnalyzerFactory::createDefault();
    }
    $this->analyzer = $charsetStreamAnalyzer;
    $this->minRelevance = self::DEFAULT_MIN_RELEVANCE;
    $this->targetCharset = self::DEFAULT_TARGET_CHARSET;
  }

  /**
   * @return Return instance of CharsetStreamAnalyzer
   * currently being used by CharsetDetector.
   */
  public function getCharsetStreamAnalyzer() {
    return $this->analyzer;
  }

  /**
   * Analyze a piece of given text and try to detect charset.
   * Only bytes with value 0x80 or higher are forwarded to the analyzer;
   * plain ASCII bytes are skipped.
   * @note This method can be called repeatedly; the analyzer accumulates results.
   * @param stringToAnalyze Text to use for analysis.
   * @param maxLengthAnalyzed Maximum length (in bytes) of text used for analysis.
   */
  public function analyze($stringToAnalyze, $maxLengthAnalyzed = self::DEFAULT_MAX_ANALYZED_LENGTH) {
    // Limit analyzed length
    $length = min(strlen($stringToAnalyze), $maxLengthAnalyzed);

    for ($i = 0; $i < $length; $i++) {
      $char = $stringToAnalyze[$i];
      if (0x80 <= ord($char)) {
        // Non-ASCII character
        $this->analyzer->putchar($char);
      }
    }
  }

  /**
   * Convert given text to target charset if detected charset is relevant @e enough.
   * The best candidate is the first item of the analyzer's sorted weight map.
   * @note This method can be called repeatedly.
   * @note If the converter reports a failure (returns false), the original
   * text is returned instead, so the result is always a usable string.
   * @param stringToConvert Text to convert - input of this method.
   * @return Return converted text to target charset, or original
   * if no relevant charset was detected or the conversion failed.
   */
  public function convertIfRelevant($stringToConvert) {
    $weightMap = $this->analyzer->getCharsetWeightMap();
    $best = $weightMap->createSortedIterator()->next();
    if (null == $best) {
      // Nothing was analyzed (or no charset is configured).
      return $stringToConvert;
    }

    $relWeight = $best->getRelWeight();
    $charset = $best->getCharset();
    if ($relWeight < $this->minRelevance) {
      return $stringToConvert;
    }

    $converted = CharsetConverter::convertCharset($charset, $this->targetCharset, $stringToConvert);
    // iconv() signals failure with false; never leak that to the caller.
    return (false === $converted) ? $stringToConvert : $converted;
  }

  /**
   * @return Return target charset.
   */
  public function getTargetCharset() {
    return $this->targetCharset;
  }

  /**
   * Set target charset.
   * @param targetCharset Target charset to set to.
   */
  public function setTargetCharset($targetCharset) {
    $this->targetCharset = $targetCharset;
  }

  /**
   * See documentation of CharsetDetector::setMinRelevance method.
   * @return Return minimal relevance.
   */
  public function getMinRelevance() {
    return $this->minRelevance;
  }

  /**
   * Set minimal relevance.\ Relevance is normalized to interval <-1, 1>.\ 
   * 1 means surely detected, -1 means surely not this charset and 0 means don't know.\ 
   * @note This parameter should always be greater than zero.
   * @param minRelevance Minimal relevance to set to.
   */
  public function setMinRelevance($minRelevance) {
    $this->minRelevance = $minRelevance;
  }
};
 
/**
 * Dummy wrapper around iconv library.
 * @ingroup CharsetDetector
 * Created in the hope of reducing direct dependence on the iconv library.
 */
class CharsetConverter {
  /**
   * Recode a string by delegating directly to the standard iconv function.
   * No validation or error handling is added; the result of iconv is
   * handed back unchanged, so refer to its manual for details.
   * @param sourceEncoding Charset the input is currently encoded in.
   * @param targetEncoding Charset to convert to.
   * @param stringToConvert Text to recode.
   * @return Return whatever iconv returned for the given arguments.
   */
  public static function convertCharset($sourceEncoding, $targetEncoding, $stringToConvert) {
    $recoded = iconv($sourceEncoding, $targetEncoding, $stringToConvert);
    return $recoded;
  }
}
 
/**
 * Input stream interface.
 * @ingroup CharsetDetector
 * Now supporting only one-char transfer.
 */
interface IInputStream {
  /**
   * Feed a single character into the stream.
   * @param charToPut Character to send to stream.
   * NOTE(review): the implementer visible in this file (CharsetStreamAnalyzer)
   * passes the value to ord(), so presumably a one-byte string is expected -
   * confirm for other implementers.
   */
  public function putchar($charToPut);
};
 
/**
 * Interface of extensible character weight maps.
 * @ingroup CharsetDetector
 */
interface ICharWeightMapList {
  /**
   * Register the weight map of a charset, replacing any map previously
   * registered under the same name.
   * @param charsetName Name of charset to define.
   * @param charWeightMap Weight map's data. Format is @e character @e => @e weight.
   * NOTE(review): the maps built in this file are keyed by the character's
   * byte value (an integer), not by the character itself - confirm the
   * intended key type.
   */
  public function addWeightMap($charsetName, $charWeightMap);
};
 
/**
 * This class analyzes text to detect its charset.
 * @ingroup CharsetDetector
 */
class CharsetStreamAnalyzer implements
    IInputStream,
    ICharWeightMapList
{
  private $weightMapList = array();  ///< Registered maps: charset name => (byte value => weight).
  private $charsetWeightMap;         ///< Accumulated per-charset weights.

  /**
   * Create an analyzer with no charset registered and an empty result container.
   */
  public function __construct() {
    $this->charsetWeightMap = new CharsetWeightMap();
  }

  /**
   * Analyze one character.
   * The character's byte value is looked up in every registered weight map;
   * each map that knows the value contributes its weight to that charset.
   * @copydoc IInputStream::putchar
   */
  public function putchar($charToPut) {
    $byteValue = ord($charToPut);
    foreach ($this->weightMapList as $charsetName => $weights) {
      if (!array_key_exists($byteValue, $weights)) {
        // This charset has no opinion about the byte.
        continue;
      }
      $this->charsetWeightMap->addWeight($charsetName, $weights[$byteValue]);
    }
  }

  /**
   * Add weight map for new charset or overwrite an existing one.
   * The charset is also entered into the result container with a zero
   * contribution, so it is listed even before any character is analyzed.
   * @copydoc ICharWeightMapList::addWeightMap
   */
  public function addWeightMap($charsetName, $charWeightMap) {
    $this->charsetWeightMap->addWeight($charsetName, 0);
    $this->weightMapList[$charsetName] = $charWeightMap;
  }

  /**
   * Hand out the results of the analysis collected so far.
   * @return Return a clone of the CharsetWeightMap object containing the
   * results, so the caller cannot modify the analyzer's own state.
   */
  public function getCharsetWeightMap() {
    $snapshot = clone $this->charsetWeightMap;
    return $snapshot;
  }
};
 
/**
 * Container for the data produced by charset analysis.
 * @ingroup CharsetDetector
 */
class CharsetWeightMap {
  private $map = array();    ///< Charset name => accumulated (signed) weight.
  private $totalWeight = 0;  ///< Sum of all positive increments, over all charsets.

  /**
   * Add a weight to the counter of the selected charset.
   * Positive increments are additionally summed into the total used for
   * computing relative weights; zero and negative ones are not.
   * @param charset Charset name to increment.
   * @param weight Weight of increment - can be a positive or negative integer.
   */
  public function addWeight($charset, $weight) {
    $isKnown = isset($this->map[$charset]);
    if (!$isKnown) {
      // Start the counter explicitly to avoid an 'Undefined index' notice.
      $this->map[$charset] = 0;
    }

    $this->map[$charset] += $weight;
    if ($weight > 0) {
      $this->totalWeight += $weight;
    }
  }

  /**
   * Create an iterator over the maintained items in descending order of weight.
   * Each item carries the charset name, its absolute weight and the weight
   * divided by the total of positive increments. While that total is zero
   * the iterator is returned empty.
   * @return Return initialized object, which implements
   * ICharsetWeightMapIterator interface.
   */
  public function createSortedIterator() {
    $result = new CharsetWeightMapIterator();
    if ($this->totalWeight <= 0) {
      return $result;
    }

    // Sort a copy so the container's own ordering stays untouched.
    $ranked = $this->map;
    arsort($ranked);
    $total = $this->totalWeight;
    foreach ($ranked as $name => $absolute) {
      $result->add(new CharsetWeightMapItem($name, $absolute, $absolute / $total));
    }
    return $result;
  }
};
 
/**
 * Interface of charset weight map iterator.
 * @ingroup CharsetDetector
 * @note Design pattern @b iterator.
 */
interface ICharsetWeightMapIterator {
  /**
   * Return the current item and then advance to the following one.
   * @return Return current item or null if there is no current item
   * (i.e. the iteration is exhausted or the iterator is empty).
   */
  public function next();
};
 
/**
 * Realization of ICharsetWeightMapIterator used in CharsetWeightMap class.
 * @ingroup CharsetDetector
 * @attention This class was @b not @b tested.
 */
class CharsetWeightMapIterator implements ICharsetWeightMapIterator {
  private $items = array();  ///< Items in the order they were added.
  private $cursor = 0;       ///< Position of the item the next call to next() returns.

  /**
   * @copydoc ICharsetWeightMapIterator::next
   * @attention This class was @b not @b tested.
   */
  public function next() {
    if ($this->cursor < count($this->items)) {
      $current = $this->items[$this->cursor];
      $this->cursor++;
      return $current;
    }
    // Past the last item.
    return null;
  }

  /**
   * Append an item to the iterator. Usually called by the creator.
   * @param charsetWeightMapItem Item to add.
   */
  public function add($charsetWeightMapItem) {
    $this->items[] = $charsetWeightMapItem;
  }
};
 
/**
 * Class containing data of charset weight map item.
 * @ingroup CharsetDetector
 * Objects of this class are immutable; they just hold data.
 */
class CharsetWeightMapItem {
  private $charset;    ///< Charset name.
  private $absWeight;  ///< Absolute weight of the charset.
  private $relWeight;  ///< Relative weight of the charset.

  /**
   * Store the three values; they are only readable afterwards.
   * @param charset Charset name.
   * @param absWeight Absolute weight of charset.
   * @param relWeight Relative weight of charset.\ Using interval <-1, 1>.
   */
  public function __construct($charset, $absWeight, $relWeight) {
    $this->relWeight = $relWeight;
    $this->absWeight = $absWeight;
    $this->charset = $charset;
  }

  /**
   * @return Return charset name.
   */
  public function getCharset() {
    return $this->charset;
  }

  /**
   * @return Return absolute weight of charset.
   */
  public function getAbsWeight() {
    return $this->absWeight;
  }

  /**
   * @return Return relative weight of charset.\ Using interval <-1, 1>.
   */
  public function getRelWeight() {
    return $this->relWeight;
  }
};
 
/**
 * Factory for CharsetStreamAnalyzer objects.
 * @ingroup CharsetDetector
 */
class CharsetStreamAnalyzerFactory {
  /**
   * @return Return non-configured CharsetStreamAnalyzer object.
   * @note You need to load configuration yourself.
   */
  public static function createEmpty() {
    return new CharsetStreamAnalyzer;
  }

  /**
   * @return Return CharsetStreamAnalyzer object with default
   * configuration loaded (weight maps for ISO-8859-2 and CP1250).
   * @note You can extend its configuration yourself.
   * @note Consider this method's code while creating new extensions.
   */
  public static function createDefault() {
    $analyzer = self::createEmpty();

    // Byte values rewarded in both charsets (same weight in both maps).
    // 0xF9 belongs to this shared set: it was rewarded for ISO-8859-2 only,
    // while its upper-case pair 0xD9 was rewarded for both, which skewed
    // detection of CP1250 text toward ISO-8859-2.
    $sharedLetters = array(
      0xC1, 0xC8, 0xC9, 0xCC, 0xCD, 0xCF,
      0xD2, 0xD3, 0xD8, 0xD9, 0xDA, 0xDD,
      0xE1, 0xE8, 0xE9, 0xEC, 0xED, 0xEF,
      0xF2, 0xF3, 0xF8, 0xF9, 0xFA, 0xFD,
    );

    // ISO-8859-2: 0x80-0x9F are invalid characters (-5),
    // 0xA0-0xFF get the default weight (-1).
    $charWeightMap = array_fill(0x80, 0x20, -5) + array_fill(0xA0, 0x60, -1);
    // Positive weight - positions specific to this charset.
    $charWeightMap = self::withWeight($charWeightMap, array(0xA9, 0xAB, 0xAE, 0xB9, 0xBB, 0xBE), 1);
    $charWeightMap = self::withWeight($charWeightMap, $sharedLetters, 1);
    $analyzer->addWeightMap("ISO-8859-2", $charWeightMap);

    // Windows 1250: the whole 0x80-0xFF range gets the default weight (-1).
    $charWeightMap = array_fill(0x80, 0x80, -1);
    // Invalid characters
    $charWeightMap = self::withWeight($charWeightMap, array(0x81, 0x83, 0x88, 0x90, 0x98), -5);
    // Positive weight - positions specific to this charset.
    $charWeightMap = self::withWeight($charWeightMap, array(0x8A, 0x8D, 0x8E, 0x9A, 0x9D, 0x9E), 1);
    $charWeightMap = self::withWeight($charWeightMap, $sharedLetters, 1);
    // NOTE(review): 0xE5 is rewarded here but not in the ISO-8859-2 map, and
    // 0xC5 is rewarded in neither. Kept exactly as in the original
    // configuration - confirm whether this asymmetry is intentional.
    $charWeightMap[0xE5] = 1;
    $analyzer->addWeightMap("CP1250", $charWeightMap);

    return $analyzer;
  }

  /**
   * Helper: assign one weight to several byte values of a weight map.
   * @param charWeightMap Weight map to start from (byte value => weight).
   * @param charCodes List of byte values to (re)define.
   * @param weight Weight assigned to every listed byte value.
   * @return Return the updated copy of the weight map.
   */
  private static function withWeight($charWeightMap, $charCodes, $weight) {
    foreach ($charCodes as $code) {
      $charWeightMap[$code] = $weight;
    }
    return $charWeightMap;
  }
};
 
?>