<?php
  /**
  * Transana RTF transcription to srt conversion.
  * 
  * Example:
  *   require_once("TransRtf2Srt.php");
  *   $srt=new TransRtf2Srt("transcription.rtf");
  *   echo $srt->getSrt();
  *
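  * Bold, italic and underline markup is removed by default; pass TRUE as the
  * second constructor argument to convert it to <b>, <i> and <u> tags.
  * The transcription must begin with the <0> timecode.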
  *
  * @author Dorian Soru <doriansoru@gmail.com>
  * @version 0.91
  * @copyright 2010 Dorian Soru
  * @license GPL 2 http://www.gnu.org/licenses/gpl-2.0.html
  */ 
/**
 * Converts a Transana RTF transcript into SRT subtitle text.
 *
 * The RTF file is read once in the constructor; every call to getSrt()
 * re-runs the conversion from that original content, so regexes changed
 * through the setters take effect on the next getSrt() call.
 *
 * Timecodes are expected in the form <N> at the beginning of a line, where
 * N is a number of milliseconds, each preceded in the RTF by the \'a4 marker.
 */
class TransRtf2Srt {
  /** Hex code of the RTF escape (\'a4) that precedes every timecode. */
  private $trans_timecode_code='a4';
  /** Regex fragment matching the marker right before the <0> timecode. */
  private $trans_timecode;
  /** Regex matching everything from "{\rtfN" up to that first marker. */
  private $reg_rtf_trans_prolog;
  /** Regex matching \'hh (hex) and \uNNNN (decimal) RTF character escapes. */
  private $reg_rtf_special_chars='/\\\\(?<char>((\')([0-9a-f]{2}))|((u)([0-9]{4})))/im';
  private $CHAR_TYPE_HEX="'";
  private $CHAR_TYPE_UNICODE="u";
  /** Stored by setArrRtfSpecialChars(); not consulted by the conversion. */
  private $arr_rtf_special_chars;
  /** Regex body (without delimiters) of the RTF command run to delete. */
  private $reg_commands;
  /** Building blocks of the bold/italic/underline regexes. */
  private $reg_formatted_content;
  private $reg_post_format;
  private $reg_bold_format;
  private $reg_italic_format;
  private $reg_underline_format;
  /** array(patterns, replacements) handed to preg_replace() for formatting. */
  private $arr_reg_format;
  private $reg_timecode_parentheses="/^\(\d+:\d+:\d+\.\d+\)/m";
  private $reg_thousandths='/^<\d+>/m'; 
  /** Milliseconds subtracted from the next timecode to end the current cue. */
  private $nextframe_delay=10;
  /** Stored by its setter; not consulted by the conversion. */
  private $timecode_thousandths_digits=3;
  /** Duration in milliseconds given to the last cue, which has no successor. */
  private $last_subtitle_duration=3000;
  private $rtf;
  private $srt;
  
  /**
   * Left-pads a value with zeros up to the requested width.
   *
   * Values that are already as long as, or longer than, $digits are returned
   * unchanged (as a string), so nothing is ever truncated.
   *
   * @param int   $digits minimum number of characters
   * @param mixed $what   value to pad
   * @return string
   */
  private function roundToDigits($digits, $what) {
    return str_pad((string) $what, $digits, "0", STR_PAD_LEFT);
  }
  
  /**
   * Splits a millisecond timecode into zero-padded SRT components.
   *
   * @param mixed $timecode milliseconds; negative values are treated as 0
   * @param int   $digits   pad width of hours, minutes and seconds
   * @return array keys "hours", "minutes", "seconds", "thousandths"
   */
  private function getSrtTime($timecode, $digits=2) {
    $total_ms=max(0, (int) $timecode);
    $total_seconds=(int) floor($total_ms / 1000);

    $hours=(int) floor($total_seconds / 3600);
    $minutes=(int) floor(($total_seconds % 3600) / 60);
    $seconds=$total_seconds % 60;
    $thousandths=$total_ms % 1000;

    return array(
      "hours"=>$this->roundToDigits($digits, $hours),
      "minutes"=>$this->roundToDigits($digits, $minutes),
      "seconds"=>$this->roundToDigits($digits, $seconds),
      "thousandths"=>$this->roundToDigits(3, $thousandths)
    );
  }

  /**
   * Formats a millisecond timecode as "HH:MM:SS,mmm".
   *
   * @param int $milliseconds
   * @return string
   */
  private function formatSrtTime($milliseconds) {
    $parts=$this->getSrtTime($milliseconds);
    return $parts['hours'] . ":" . $parts['minutes'] . ":" . $parts['seconds'] . "," . $parts['thousandths'];
  }
  
  /**
   * Runs the whole RTF -> SRT conversion and stores the result in $this->srt.
   */
  private function rtf2srt() {
    //Deletes prolog
    $this->srt=preg_replace($this->reg_rtf_trans_prolog, '', $this->rtf);

    //Converts special chars
    if (preg_match_all($this->reg_rtf_special_chars, $this->srt, $arr_special_chars) && isset($arr_special_chars['char'])) {
      //str_replace() below handles every occurrence at once, so each
      //distinct escape only needs to be processed one time
      foreach (array_unique($arr_special_chars['char']) as $char) {
        $char_type=substr($char, 0, 1);
        $char_value=substr($char, 1);
        if (strcasecmp($char_value, $this->trans_timecode_code) === 0) {
          //The timecode marker itself carries no text
          $char_content='';
        } elseif ($char_type === $this->CHAR_TYPE_HEX) {
          //NOTE(review): this emits the raw code-page byte while the unicode
          //branch emits UTF-8, so the output can mix encodings.
          $char_content=chr(hexdec($char_value));
        } elseif (strcasecmp($char_type, $this->CHAR_TYPE_UNICODE) === 0) {
          $char_content=html_entity_decode('&#' . (int) $char_value . ';', ENT_QUOTES, 'UTF-8');
        } else {
          //Unknown escape type: leave the text untouched
          continue;
        }
        $this->srt=str_replace('\\' . $char, $char_content, $this->srt);
      }
    }

    //Replaces any commands
    $this->srt=preg_replace('/' . $this->reg_commands . '/ism', '', $this->srt);
    //Bold, italics, underlined
    $this->srt=preg_replace($this->arr_reg_format[0], $this->arr_reg_format[1], $this->srt);
    //Strip tags
    //NOTE(review): this pattern matches TWO literal backslashes before the
    //word. It is kept as is because a single-backslash version would strip
    //"\line" before it is turned into a newline below - confirm the intent.
    $this->srt = preg_replace('/\\\\\\\\[a-z0-9]+/', '', $this->srt);
    $this->srt = str_replace('}{', '', $this->srt);
    //Final cleanings...
    $this->srt = preg_replace('/ {2,}/', ' ', $this->srt);
    $this->srt = str_replace("\\line", "\n", $this->srt);
    $this->srt = str_replace("}\n\\par}\n", "", $this->srt);
    //Deletes timecodes enclosed in parentheses, if any
    $this->srt = preg_replace($this->reg_timecode_parentheses, '', $this->srt);

    //Detects timecodes together with their byte offsets
    if (!preg_match_all($this->reg_thousandths, $this->srt, $arr_timecodes, PREG_OFFSET_CAPTURE)) {
      return;
    }
    $arr_matches=$arr_timecodes[0];
    $j=count($arr_matches);
    //Walks backwards so that the offsets of the earlier matches stay valid
    //while the later ones are being replaced
    for ($i=$j - 1; $i >= 0; $i--) {
      $tag=$arr_matches[$i][0];
      $offset=$arr_matches[$i][1];
      //Drops the enclosing "<" and ">"
      $start_ms=(int) substr($tag, 1, -1);
      if ($i + 1 < $j) {
        //The cue ends slightly before the next one begins
        $end_ms=(int) substr($arr_matches[$i + 1][0], 1, -1) - $this->nextframe_delay;
      } else {
        $end_ms=$start_ms + $this->last_subtitle_duration;
      }
      //A cue never ends before it starts
      $end_ms=max($start_ms, $end_ms);

      $srt_timecode=($i + 1) . "\n" . $this->formatSrtTime($start_ms) . " --> " . $this->formatSrtTime($end_ms) . "\n";

      //Replaces exactly this match, so identical timecodes found elsewhere
      //in the text keep their own cue number
      $this->srt=substr_replace($this->srt, $srt_timecode, $offset, strlen($tag));
    }
  }

  /**
   * Sets the stored timecode-marker regex fragment. The prolog regex is
   * built in the constructor and is not rebuilt by this call.
   */
  public function setTransTimecode($trans_timecode) {
    $this->trans_timecode=$trans_timecode;
  }

  public function getTransTimecode() {
    return $this->trans_timecode;
  }
  
  /** Sets the full regex (with delimiters) used to strip the RTF prolog. */
  public function setRegRtfTransProlog($reg_rtf_trans_prolog) {
    $this->reg_rtf_trans_prolog=$reg_rtf_trans_prolog;
  }

  public function getRegRtfTransProlog() {
    return $this->reg_rtf_trans_prolog;
  }

  /** Stores the given array; the conversion itself does not read it. */
  public function setArrRtfSpecialChars(Array $arr_rtf_special_chars) {
    $this->arr_rtf_special_chars=$arr_rtf_special_chars;
  }

  public function getArrRtfSpecialChars() {
    return $this->arr_rtf_special_chars;
  }

  /** Sets the regex body (no delimiters) of the commands to delete. */
  public function setRegCommands($reg_commands) {
    $this->reg_commands=$reg_commands;
  }

  public function getRegCommands() {
    return $this->reg_commands;
  }
  
  /**
   * Sets the formatting rules.
   *
   * @param array $arr_reg_format array(patterns, replacements) as accepted
   *                              by preg_replace()
   */
  public function setArrRegFormat(Array $arr_reg_format) {
    $this->arr_reg_format=$arr_reg_format;
  }

  public function getArrRegFormat() {
    return $this->arr_reg_format;
  }

  /** Sets the regex of the parenthesised timecodes that get deleted. */
  public function setRegTimecodeParentheses($reg_timecode_parentheses) {
    $this->reg_timecode_parentheses=$reg_timecode_parentheses;
  }

  public function getRegTimecodeParentheses() {
    return $this->reg_timecode_parentheses;
  }
  
  /**
   * Sets the regex locating the millisecond timecodes. The first and the
   * last character of every match are discarded to obtain the number.
   */
  public function setRegThousandths($reg_thousandths) {
    $this->reg_thousandths=$reg_thousandths;
  }

  public function getRegThousandths() {
    return $this->reg_thousandths;
  }  

  /** Sets the gap, in milliseconds, left before the next cue starts. */
  public function setNextframeDelay($nextframe_delay) {
    $this->nextframe_delay=$nextframe_delay;
  }

  public function getNextframeDelay() {
    return $this->nextframe_delay;
  }    

  public function setTimecodeThousandthsDigits($timecode_thousandths_digits) {
    $this->timecode_thousandths_digits=$timecode_thousandths_digits;
  }

  public function getTimecodeThousandthsDigits() {
    return $this->timecode_thousandths_digits;
  }    

  /** Sets the duration, in milliseconds, of the last cue. */
  public function setLastSubtitleDuration($last_subtitle_duration) {
    $this->last_subtitle_duration=$last_subtitle_duration;
  }

  public function getLastSubtitleDuration() {
    return $this->last_subtitle_duration;
  }

  /**
   * Converts the loaded RTF and returns the SRT text.
   *
   * @return string
   */
  public function getSrt() {
    $this->rtf2srt();
    return $this->srt;
  }


  /**
   * Builds the default regexes and loads the RTF file.
   *
   * @param string $rtf_filename    path of the Transana RTF transcript
   * @param bool   $keep_formatting TRUE to turn bold, italic and underline
   *                                into <b>, <i> and <u> tags; FALSE to keep
   *                                only the text
   * @throws RuntimeException when the file cannot be read
   */
  public function __construct($rtf_filename, $keep_formatting=FALSE) {
    $this->trans_timecode='\\\\\'' . $this->trans_timecode_code . '(?=<0>)';
    //Matches the RTF prolog: from the beginning of the file until the 
    //first transana timecode
    $this->reg_rtf_trans_prolog='/\{\\\\rtf([0-9])(.*)' . $this->trans_timecode . "/ism";
    //Sets up $arr_reg_format
    $this->reg_commands='\{ò\\\\fs20\\\\cf0\\\\cb2';
    $this->reg_formatted_content='([^\\}])*';
    $this->reg_post_format='\\}';
    $this->reg_bold_format='\}?\\\\b ';
    $this->reg_italic_format='\}?\\\\i ';
    $this->reg_underline_format='\}?\\\\ul ';
    if ($keep_formatting) {
      $arr_formatting_replacement=array('<b>$1</b>', '<i>$1</i>', '<u>$1</u>');
    } else {
      $arr_formatting_replacement=array('$1', '$1', '$1');
    }
    $arr_formatting_patterns=array();
    foreach (array($this->reg_bold_format, $this->reg_italic_format, $this->reg_underline_format) as $reg_format) {
      $arr_formatting_patterns[]='/' . $reg_format . '(' . $this->reg_formatted_content . ')' . $this->reg_post_format . '/ism';
    }
    $this->arr_reg_format=array($arr_formatting_patterns, $arr_formatting_replacement);

    $rtf=file_get_contents($rtf_filename);
    if ($rtf === false) {
      throw new RuntimeException('Unable to read RTF file: ' . $rtf_filename);
    }
    $this->rtf=$rtf;
  }
}
?>
