Ruby on Rails | Screencasts | Download | Documentation | Weblog | Community | Source

Ticket #5396: updated_multibyte_activesupport.diff

File updated_multibyte_activesupport.diff, 59.0 kB (added by manfred, 4 years ago)
  • activesupport/test/multibyte_conformance.rb

    old new  
     1require File.dirname(__FILE__) + '/abstract_unit' 
     2require 'open-uri' 
     3 
     4$KCODE = 'UTF8' 
     5UNIDATA_VERSION = "5.0.0" 
     6UNIDATA_URL = "http://www.unicode.org/Public/#{UNIDATA_VERSION}/ucd" 
     7UNIDATA_FILE = '/NormalizationTest.txt' 
     8CACHE_DIR = File.dirname(__FILE__) + '/cache' 
     9 
     10class Downloader 
     11  def self.download(from, to) 
     12    unless File.exist?(to) 
     13       $stderr.puts "Downloading #{from} to #{to}" 
     14       open(from) do |source| 
     15         File.open(to, 'w') do |target| 
     16           source.each_line do |l| 
     17             target.write l 
     18           end 
     19         end 
     20       end 
     21     end 
     22  end 
     23end 
     24 
     25class String 
     26  # Unicode Inspect returns the codepoints of the string in hex 
     27  def ui 
     28    "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' ')) 
     29  end unless ''.respond_to?(:ui) 
     30end 
     31 
     32Dir.mkdir(CACHE_DIR) unless File.exists?(CACHE_DIR) 
     33Downloader.download(UNIDATA_URL + UNIDATA_FILE, CACHE_DIR + UNIDATA_FILE) 
     34 
     35module ConformanceTest 
     36  def test_normalizations_C 
     37    each_line_of_norm_tests do |*cols| 
     38      col1, col2, col3, col4, col5, comment = *cols 
     39       
     40      # CONFORMANCE: 
     41      # 1. The following invariants must be true for all conformant implementations 
     42      # 
     43      #    NFC 
     44      #      c2 ==  NFC(c1) ==  NFC(c2) ==  NFC(c3) 
     45      assert_equal col2.ui, @handler.normalize(col1, :c).ui, "Form C - Col 2 has to be NFC(1) - #{comment}" 
     46      assert_equal col2.ui, @handler.normalize(col2, :c).ui, "Form C - Col 2 has to be NFC(2) - #{comment}" 
     47      assert_equal col2.ui, @handler.normalize(col3, :c).ui, "Form C - Col 2 has to be NFC(3) - #{comment}" 
     48      # 
     49      #      c4 ==  NFC(c4) ==  NFC(c5) 
     50      assert_equal col4.ui, @handler.normalize(col4, :c).ui, "Form C - Col 4 has to be C(4) - #{comment}" 
     51      assert_equal col4.ui, @handler.normalize(col5, :c).ui, "Form C - Col 4 has to be C(5) - #{comment}" 
     52    end 
     53  end 
     54   
     55  def test_normalizations_D 
     56    each_line_of_norm_tests do |*cols| 
     57      col1, col2, col3, col4, col5, comment = *cols 
     58      # 
     59      #    NFD 
     60      #      c3 ==  NFD(c1) ==  NFD(c2) ==  NFD(c3) 
     61      assert_equal col3.ui, @handler.normalize(col1, :d).ui, "Form D - Col 3 has to be NFD(1) - #{comment}" 
     62      assert_equal col3.ui, @handler.normalize(col2, :d).ui, "Form D - Col 3 has to be NFD(2) - #{comment}" 
     63      assert_equal col3.ui, @handler.normalize(col3, :d).ui, "Form D - Col 3 has to be NFD(3) - #{comment}" 
     64      #      c5 ==  NFD(c4) ==  NFD(c5) 
     65      assert_equal col5.ui, @handler.normalize(col4, :d).ui, "Form D - Col 5 has to be NFD(4) - #{comment}" 
     66      assert_equal col5.ui, @handler.normalize(col5, :d).ui, "Form D - Col 5 has to be NFD(5) - #{comment}" 
     67    end 
     68  end 
     69   
     70  def test_normalizations_KC 
     71    each_line_of_norm_tests do | *cols | 
     72      col1, col2, col3, col4, col5, comment = *cols   
     73      # 
     74      #    NFKC 
     75      #      c4 == NFKC(c1) == NFKC(c2) == NFKC(c3) == NFKC(c4) == NFKC(c5) 
     76      assert_equal col4.ui, @handler.normalize(col1, :kc).ui, "Form D - Col 4 has to be NFKC(1) - #{comment}" 
     77      assert_equal col4.ui, @handler.normalize(col2, :kc).ui, "Form D - Col 4 has to be NFKC(2) - #{comment}" 
     78      assert_equal col4.ui, @handler.normalize(col3, :kc).ui, "Form D - Col 4 has to be NFKC(3) - #{comment}" 
     79      assert_equal col4.ui, @handler.normalize(col4, :kc).ui, "Form D - Col 4 has to be NFKC(4) - #{comment}" 
     80      assert_equal col4.ui, @handler.normalize(col5, :kc).ui, "Form D - Col 4 has to be NFKC(5) - #{comment}" 
     81    end 
     82  end 
     83   
     84  def test_normalizations_KD 
     85    each_line_of_norm_tests do | *cols | 
     86      col1, col2, col3, col4, col5, comment = *cols   
     87      # 
     88      #    NFKD 
     89      #      c5 == NFKD(c1) == NFKD(c2) == NFKD(c3) == NFKD(c4) == NFKD(c5) 
     90      assert_equal col5.ui, @handler.normalize(col1, :kd).ui, "Form KD - Col 5 has to be NFKD(1) - #{comment}" 
     91      assert_equal col5.ui, @handler.normalize(col2, :kd).ui, "Form KD - Col 5 has to be NFKD(2) - #{comment}" 
     92      assert_equal col5.ui, @handler.normalize(col3, :kd).ui, "Form KD - Col 5 has to be NFKD(3) - #{comment}" 
     93      assert_equal col5.ui, @handler.normalize(col4, :kd).ui, "Form KD - Col 5 has to be NFKD(4) - #{comment}" 
     94      assert_equal col5.ui, @handler.normalize(col5, :kd).ui, "Form KD - Col 5 has to be NFKD(5) - #{comment}" 
     95    end 
     96  end 
     97   
     98  protected 
     99    def each_line_of_norm_tests(&block) 
     100      lines = 0 
     101      max_test_lines = 0 # Don't limit below 38, because that's the header of the testfile 
     102      File.open(File.dirname(__FILE__) + '/cache' + UNIDATA_FILE, 'r') do | f | 
     103        until f.eof? || (max_test_lines > 38 and lines > max_test_lines) 
     104          lines += 1 
     105          line = f.gets.chomp! 
     106          next if (line.empty? || line =~ /^\#/)       
     107           
     108          cols, comment = line.split("#") 
     109          cols = cols.split(";").map{|e| e.strip}.reject{|e| e.empty? } 
     110          next unless cols.length == 5 
     111           
     112          # codepoints are in hex in the test suite, pack wants them as integers 
     113          cols.map!{|c| c.split.map{|codepoint| codepoint.to_i(16)}.pack("U*") } 
     114          cols << comment 
     115           
     116          yield(*cols) 
     117        end 
     118      end 
     119    end 
     120end 
     121 
     122begin 
     123  require_library_or_gem('utf8proc_native') 
     124  require 'active_record/multibyte/handlers/utf8_handler_proc' 
     125  class ConformanceTestProc < Test::Unit::TestCase 
     126    include ConformanceTest 
     127    def setup 
     128      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc 
     129    end 
     130  end 
     131rescue LoadError 
     132end 
     133 
     134class ConformanceTestPure < Test::Unit::TestCase 
     135  include ConformanceTest 
     136  def setup 
     137    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler 
     138  end 
     139end 
  • activesupport/test/multibyte_chars_test.rb

    old new  
     1require File.dirname(__FILE__) + '/abstract_unit' 
     2 
     3$KCODE = 'UTF8' 
     4 
     5class CharsTest < Test::Unit::TestCase 
     6   
     7  def setup 
     8    @s = { 
     9      :utf8 => "Abcd Блå ffi блa  埋", 
     10      :ascii => "asci ias c iia s", 
     11      :bytes => "\270\236\010\210\245" 
     12    } 
     13  end 
     14   
     15  def test_sanity 
     16    @s.each do |t, s| 
     17      assert s.respond_to?(:chars), "All string should have the chars method (#{t})" 
     18      assert s.respond_to?(:to_s), "All string should have the to_s method (#{t})" 
     19      assert_kind_of ActiveSupport::Multibyte::Chars, s.chars, "#chars should return an instance of Chars (#{t})" 
     20    end 
     21  end 
     22   
     23  def test_comparability 
     24    @s.each do |t, s| 
     25      assert_equal s, s.chars.to_s, "Chars#to_s should return enclosed string unchanged" 
     26    end 
     27    assert_nothing_raised do 
     28      assert_equal "a", "a", "Normal string comparisons should be unaffected" 
     29      assert_not_equal "a", "b", "Normal string comparisons should be unaffected" 
     30      assert_not_equal "a".chars, "b".chars, "Chars objects should be comparable" 
     31      assert_equal "a".chars, "A".downcase.chars, "Chars objects should be comparable to each other" 
     32      assert_equal "a".chars, "A".downcase, "Chars objects should be comparable to strings coming from elsewhere" 
     33    end 
     34     
     35    assert !@s[:utf8].eql?(@s[:utf8].chars), "Strict comparison is not supported" 
     36    assert_equal @s[:utf8], @s[:utf8].chars, "Chars should be compared by their enclosed string" 
     37 
     38    other_string = @s[:utf8].dup 
     39    assert_equal other_string, @s[:utf8].chars, "Chars should be compared by their enclosed string" 
     40    assert_equal other_string.chars, @s[:utf8].chars, "Chars should be compared by their enclosed string" 
     41     
     42    strings = ['builder'.chars, 'armor'.chars, 'zebra'.chars] 
     43    strings.sort! 
     44    assert_equal ['armor', 'builder', 'zebra'], strings, "Chars should be sortable based on their enclosed string" 
     45 
     46    # This leads to a StackLevelTooDeep exception if the comparison is not wired properly 
     47    assert_raise(NameError) do 
     48      Chars 
     49    end 
     50  end 
     51   
     52  def test_utf8? 
     53    assert @s[:utf8].is_utf8?, "UTF-8 strings are UTF-8" 
     54    assert @s[:ascii].is_utf8?, "All ASCII strings are also valid UTF-8" 
     55    assert !@s[:bytes].is_utf8?, "This bytestring isn't UTF-8" 
     56  end 
     57   
     58  # The tests for the following methods are defined here because they can only be defined on the Chars class for 
     59  # various reasons 
     60   
     61  def test_gsub 
     62    assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x') 
     63    with_kcode('none') do 
     64      assert_equal 'éxa', 'éda'.chars.gsub(/d/, 'x') 
     65    end 
     66  end 
     67   
     68  def test_split 
     69    word = "efficient" 
     70    chars = ["e", "ffi", "c", "i", "e", "n", "t"] 
     71    assert_equal chars, word.split(//) 
     72    assert_equal chars, word.chars.split(//) 
     73    assert_kind_of ActiveSupport::Multibyte::Chars, word.chars.split(//).first, "Split should return Chars instances" 
     74  end 
     75   
     76  def test_regexp 
     77    with_kcode('none') do 
     78      assert_equal 12, (@s[:utf8].chars =~ /ffi/), 
     79        "Regex matching should be bypassed to String" 
     80    end 
     81    with_kcode('UTF8') do 
     82      assert_equal 9, (@s[:utf8].chars =~ /ffi/), 
     83        "Regex matching should be unicode aware" 
     84    end 
     85  end 
     86   
     87  def test_pragma 
     88    with_kcode('UTF8') do 
     89      assert " ".chars.send(:utf8_pragma?), "UTF8 pragma should be on because KCODE is UTF8" 
     90    end 
     91    with_kcode('none') do 
     92      assert !" ".chars.send(:utf8_pragma?), "UTF8 pragma should be off" 
     93    end 
     94  end 
     95   
     96  def test_handler_setting 
     97    handler = ''.chars.handler 
     98     
     99    ActiveSupport::Multibyte::Chars.handler = :first 
     100    assert_equal :first, ''.chars.handler 
     101    ActiveSupport::Multibyte::Chars.handler = :second 
     102    assert_equal :second, ''.chars.handler 
     103    assert_raise(NoMethodError) do 
     104      ''.chars.handler.split 
     105    end 
     106     
     107    ActiveSupport::Multibyte::Chars.handler = handler 
     108  end 
     109   
     110  def test_method_chaining 
     111    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase 
     112    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.strip, "Strip should return a Chars object" 
     113    assert_kind_of ActiveSupport::Multibyte::Chars, ''.chars.downcase.strip, "The Chars object should be " + 
     114        "forwarded down the call path for chaining" 
     115    assert_equal 'foo', "  FOO   ".chars.normalize.downcase.strip, "The Chars that results from the " + 
     116      " operations should be comparable to the string value of the result" 
     117  end 
     118   
     119  def test_passthrough_on_kcode 
     120    # The easiest way to check if the passthrough is in place is through #size 
     121    with_kcode('nonce') do 
     122      assert_equal 26, @s[:utf8].chars.size 
     123    end 
     124    with_kcode('UTF8') do 
     125      assert_equal 17, @s[:utf8].chars.size 
     126    end 
     127  end 
     128     
     129  def test_destructiveness   
     130    # Note that we're testing the destructiveness here and not the correct behaviour of the methods 
     131    str = 'ac' 
     132    str.chars.insert(1, 'b') 
     133    assert_equal 'abc', str, 'Insert should be destructive for a string' 
     134     
     135    str = 'ac' 
     136    str.chars.reverse! 
     137    assert_equal 'ca', str, 'reverse! should be destructive for a string' 
     138  end 
     139   
     140  def test_resilience 
     141    assert_nothing_raised do 
     142      assert_equal 1, @s[:bytes].chars.size, "There's only one valid utf-8 byte in the string" 
     143    end 
     144    assert_nothing_raised do 
     145      assert_equal "\010", @s[:bytes].chars.reverse, "There's only one valid utf-8 byte in the string" 
     146    end 
     147    assert_nothing_raised do 
     148      @s[:bytes].chars.reverse! 
     149      assert_equal "\010", @s[:bytes], "There's only one valid utf-8 byte in the string" 
     150    end 
     151  end 
     152   
     153  protected 
     154 
     155  def with_kcode(kcode) 
     156    old_kcode, $KCODE = $KCODE, kcode 
     157    begin 
     158      yield 
     159    ensure 
     160      $KCODE = old_kcode 
     161    end 
     162  end 
     163end 
  • activesupport/test/multibyte_handler_test.rb

    old new  
     1require File.dirname(__FILE__) + '/abstract_unit' 
     2 
     3$KCODE = 'UTF8' 
     4 
     5class String 
     6  # Unicode Inspect returns the codepoints of the string in hex 
     7  def ui 
     8    "#{self} " + ("[%s]" % unpack("U*").map{|cp| cp.to_s(16) }.join(' ')) 
     9  end unless ''.respond_to?(:ui) 
     10end 
     11 
     12module UTF8HandlingTest 
     13   
     14  def common_setup 
     15    # This is an ASCII string with some Russian strings and a ligature. It's nicely calibrated, because 
     16    # slicing it at some specific bytes will kill your characters if you use standard Ruby routines. 
     17    # It has both capital and lowercase letters, so that we can test case conversions easily. 
     18    # It has 26 characters and 28 when the ligature gets split during normalization. 
     19    @string =     "Abcd Блå ffi бла бла бла бла" 
     20    @string_kd =  "Abcd Блå ffi бла бла бла бла" 
     21    @string_kc =  "Abcd Блå ffi бла бла бла бла" 
     22    @string_c =   "Abcd Блå ffi бла бла бла бла" 
     23    @string_d =   "Abcd Блå ffi бла бла бла бла" 
     24    @bytestring = "\270\236\010\210\245" # Not UTF-8 
     25     
     26    # Characters from the character classes as described in UAX #29 
     27    @character_from_class = { 
     28      :l => 0x1100, :v => 0x1160, :t => 0x11A8, :lv => 0xAC00, :lvt => 0xAC01, :cr => 0x000D, :lf => 0x000A, 
     29      :extend => 0x094D, :n => 0x64 
     30    } 
     31  end 
     32   
     33  def test_utf8_recognition 
     34    assert ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@string), 
     35      "Should recognize as a valid UTF-8 string" 
     36    assert !ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(@bytestring), "This is bytestring, not UTF-8" 
     37  end 
     38   
     39  def test_simple_normalization 
     40    null_byte_str = "Test\0test" 
     41     
     42    assert_equal '', @handler.normalize(''), "Empty string should not break things" 
     43    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kc).ui, "Null byte should remain" 
     44    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :c).ui, "Null byte should remain"  
     45    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :d).ui, "Null byte should remain" 
     46    assert_equal null_byte_str.ui, @handler.normalize(null_byte_str, :kd).ui, "Null byte should remain" 
     47    assert_equal null_byte_str.ui, @handler.decompose(null_byte_str).ui, "Null byte should remain" 
     48    assert_equal null_byte_str.ui, @handler.compose(null_byte_str).ui, "Null byte should remain"  
     49     
     50    comp_str = [ 
     51      44,  # COMMA (U+002C) 
     52      307, # LATIN SMALL LIGATURE IJ (U+0133) 
     53      328, # LATIN SMALL LETTER N WITH CARON (U+0148) 
     54      323 # LATIN CAPITAL LETTER N WITH ACUTE (U+0143) 
     55    ].pack("U*") 
     56    norm_str_KC = [44,105,106,328,323].pack("U*") 
     57    norm_str_C = [44,307,328,323].pack("U*") 
     58    norm_str_D = [44,307,110,780,78,769].pack("U*") 
     59    norm_str_KD = [44,105,106,110,780,78,769].pack("U*") 
     60     
     61    assert_equal norm_str_KC.ui, @handler.normalize(comp_str, :kc).ui, "Should normalize KC" 
     62    assert_equal norm_str_C.ui, @handler.normalize(comp_str, :c).ui, "Should normalize C" 
     63    assert_equal norm_str_D.ui, @handler.normalize(comp_str, :d).ui, "Should normalize D" 
     64    assert_equal norm_str_KD.ui, @handler.normalize(comp_str, :kd).ui, "Should normalize KD" 
     65     
     66    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.normalize(@bytestring) } 
     67  end 
     68   
     69  # Test for Public Review Issue #29: a bad explanation of composition might lead to a 
     70  # bad implementation: http://www.unicode.org/review/pr-29.html 
     71  def test_normalization_C_pri_29 
     72    [ 
     73      [0x0B47, 0x0300, 0x0B3E], 
     74      [0x1100, 0x0300, 0x1161] 
     75    ].map { |c| c.pack('U*') }.each do |c| 
     76      assert_equal c.ui, @handler.normalize(c, :c).ui, "Composition is implemented incorrectly" 
     77    end 
     78  end 
     79   
     80  def test_casefolding 
     81    simple_str = "abCdef" 
     82    simple_str_upcase = "ABCDEF" 
     83    simple_str_downcase = "abcdef" 
     84     
     85    assert_equal '', @handler.downcase(@handler.upcase('')), "Empty string should not break things" 
     86    assert_equal simple_str_upcase, @handler.upcase(simple_str), "should upcase properly" 
     87    assert_equal simple_str_downcase, @handler.downcase(simple_str), "should downcase properly" 
     88    assert_equal simple_str_downcase, @handler.downcase(@handler.upcase(simple_str_downcase)), "upcase and downcase should be mirrors" 
     89     
     90    rus_str = "аБвгЎ\0f" 
     91    rus_str_upcase = "АБВГД\0F" 
     92    rus_str_downcase = "абвгЎ\0f" 
     93    assert_equal rus_str_upcase, @handler.upcase(rus_str), "should upcase properly honoring null-byte" 
     94    assert_equal rus_str_downcase, @handler.downcase(rus_str), "should downcase properly honoring null-byte" 
     95     
     96    jap_str = "の埋め蟌み化察応はほが完成" 
     97    assert_equal jap_str, @handler.upcase(jap_str), "Japanse has no upcase, should remain unchanged" 
     98    assert_equal jap_str, @handler.downcase(jap_str), "Japanse has no downcase, should remain unchanged" 
     99     
     100    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.upcase(@bytestring) } 
     101  end 
     102   
     103  def test_capitalize 
     104    { 'аБвг аБвг' => 'Абвг абвг', 
     105      'аБвг АБВГ' => 'Абвг абвг', 
     106      'АБВГ АБВГ' => 'Абвг абвг', 
     107      '' => '' }.each do |f,t| 
     108        assert_equal t, @handler.capitalize(f), "Capitalize should work as expected" 
     109    end 
     110    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.capitalize(@bytestring) } 
     111  end 
     112   
     113  def test_translate_offset 
     114    str = "БлaÃ¥" # [2, 2, 1, 2] bytes 
     115    assert_equal 0, @handler.translate_offset('', 0), "Offset for an empty string makes no sense, return 0" 
     116    assert_equal 0, @handler.translate_offset(str, 0), "First character, first byte" 
     117    assert_equal 0, @handler.translate_offset(str, 1), "First character, second byte" 
     118    assert_equal 1, @handler.translate_offset(str, 2), "Second character, third byte" 
     119    assert_equal 1, @handler.translate_offset(str, 3), "Second character, fourth byte" 
     120    assert_equal 2, @handler.translate_offset(str, 4), "Third character, fifth byte" 
     121    assert_equal 3, @handler.translate_offset(str, 5), "Fourth character, sixth byte" 
     122    assert_equal 3, @handler.translate_offset(str, 6), "Fourth character, seventh byte" 
     123    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.translate_offset(@bytestring, 3) } 
     124  end 
     125   
     126  def test_insert 
     127    assert_equal '', @handler.insert('', 0, ''), "Empty string should not break things" 
     128    assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @handler.insert(@string, 10, "БУМ"),  
     129      "Text should be inserted at right codepoints" 
     130    assert_equal "Abcd Блå ffiБУМ бла бла бла бла", @string, "Insert should be destructive" 
     131    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) do 
     132      @handler.insert(@bytestring, 2, "\210") 
     133    end 
     134  end 
     135   
     136  def test_reverse 
     137    str = "wБлåa \n" 
     138    rev = "\n aåлБw" 
     139    assert_equal '', @handler.reverse(''), "Empty string shouldn't change" 
     140    assert_equal rev.ui, @handler.reverse(str).ui, "Should reverse properly" 
     141    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.reverse(@bytestring) } 
     142  end 
     143   
     144  def test_size 
     145    assert_equal 0, @handler.size(''), "Empty string has size 0" 
     146    assert_equal 26, @handler.size(@string), "String length should be 26" 
     147    assert_equal 26, @handler.length(@string), "String length method should be properly aliased" 
     148    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.size(@bytestring) } 
     149  end 
     150   
     151  def test_slice 
     152    assert_equal 0x41, @handler.slice(@string, 0), "Singular characters should return codepoints" 
     153    assert_equal 0xE5, @handler.slice(@string, 7), "Singular characters should return codepoints" 
     154    assert_equal nil, @handler.slice('', -1..1), "Broken range should return nil" 
     155    assert_equal '', @handler.slice('', 0..10), "Empty string should not break things" 
     156    assert_equal "d Блå ffi", @handler.slice(@string, 3..9), "Unicode characters have to be returned" 
     157    assert_equal " Блå ffi ", @handler.slice(@string, 4..10), "Unicode characters have to be returned" 
     158    assert_equal "", @handler.slice(@string, 7..6), "Range is empty, should return an empty string" 
     159    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.slice(@bytestring, 2..3) } 
     160  end 
     161   
     162  def test_grapheme_cluster_length 
     163    assert_equal 0, @handler.g_length(''), "String should count 0 grapheme clusters" 
     164    assert_equal 2, @handler.g_length([0x0924, 0x094D, 0x0930].pack('U*')), "String should count 2 grapheme clusters" 
     165    assert_equal 1, @handler.g_length(string_from_classes(%w(cr lf))), "Don't cut between CR and LF" 
     166    assert_equal 1, @handler.g_length(string_from_classes(%w(l l))), "Don't cut between L" 
     167    assert_equal 1, @handler.g_length(string_from_classes(%w(l v))), "Don't cut between L and V" 
     168    assert_equal 1, @handler.g_length(string_from_classes(%w(l lv))), "Don't cut between L and LV" 
     169    assert_equal 1, @handler.g_length(string_from_classes(%w(l lvt))), "Don't cut between L and LVT" 
     170    assert_equal 1, @handler.g_length(string_from_classes(%w(lv v))), "Don't cut between LV and V" 
     171    assert_equal 1, @handler.g_length(string_from_classes(%w(lv t))), "Don't cut between LV and T" 
     172    assert_equal 1, @handler.g_length(string_from_classes(%w(v v))), "Don't cut between V and V" 
     173    assert_equal 1, @handler.g_length(string_from_classes(%w(v t))), "Don't cut between V and T" 
     174    assert_equal 1, @handler.g_length(string_from_classes(%w(lvt t))), "Don't cut between LVT and T" 
     175    assert_equal 1, @handler.g_length(string_from_classes(%w(t t))), "Don't cut between T and T" 
     176    assert_equal 1, @handler.g_length(string_from_classes(%w(n extend))), "Don't cut before Extend" 
     177    assert_equal 2, @handler.g_length(string_from_classes(%w(n n))), "Cut between normal characters" 
     178    assert_equal 3, @handler.g_length(string_from_classes(%w(n cr lf n))), "Don't cut between CR and LF" 
     179    assert_equal 2, @handler.g_length(string_from_classes(%w(n l v t))), "Don't cut between L, V and T" 
     180    assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.g_length(@bytestring) } 
     181  end 
     182   
     183  def test_index 
     184     s = "ΚαληΌέρα κόσΌε!" 
     185     assert_equal 0, @handler.index('', ''), "The empty string is always found at the beginning of the string" 
     186     assert_equal 0, @handler.index('haystack', ''), "The empty string is always found at the beginning of the string" 
     187     assert_equal 0, @handler.index(s, 'Κ'), "Greek K is at 0" 
     188     assert_equal 1, @handler.index(s, 'α'), "Greek Alpha is at 1" 
     189      
     190     assert_equal nil, @handler.index(@bytestring, 'a') 
     191     assert_raise(ActiveSupport::Multibyte::Handlers::EncodingError) { @handler.index(@bytestring, "\010") } 
     192  end 
     193   
     194  def test_strip 
     195    # A Unicode-aware version of strip should strip all 26 types of whitespace. This includes the ZERO WIDTH 
     196    # NO-BREAK SPACE (U+FEFF) aka BOM (byte order mark). The BOM has no place in UTF-8 because it's used to detect LE and BE. 
     197    b = "\n" + [ 
     198      32, # SPACE 
     199      8195, # EM SPACE 
     200      8199, # FIGURE SPACE, 
     201      8201, # THIN SPACE 
     202      8202, # HAIR SPACE 
     203      65279, # NO BREAK SPACE (ZW) 
     204    ].pack('U*') 
     205    m = "word блОМ\n\n\n  word" 
     206    e = [ 
     207    65279, # NO BREAK SPACE (ZW) 
     208    8201, # THIN SPACE 
     209    8199, # FIGURE SPACE,       
     210    32, # SPACE 
     211    ].pack('U*') 
     212    string = b+m+e 
     213     
     214    assert_equal '', @handler.strip(''), "Empty string should stay empty" 
     215    assert_equal m+e, @handler.lstrip(string), "Whitespace should be gone on the left" 
     216    assert_equal b+m, @handler.rstrip(string), "Whitespace should be gone on the right" 
     217    assert_equal m, @handler.strip(string), "Whitespace should be stripped on both sides" 
     218     
     219    bs = "\n   #{@bytestring} \n\n" 
     220    assert_equal @bytestring, @handler.strip(bs), "Invalid unicode strings should still strip" 
     221  end 
     222   
     223  def test_tidy_bytes 
     224    assert_equal "\010", @handler.tidy_bytes(@bytestring) 
     225    assert_equal "a\010a", @handler.tidy_bytes('a' + @bytestring + 'a') 
     226    assert_nothing_raised { @handler.tidy_bytes(@bytestring).unpack('U*') } 
     227  end 
     228   
     229  protected 
     230   
     231  def string_from_classes(classes) 
     232    classes.collect do |k| 
     233      @character_from_class[k.intern] 
     234    end.pack('U*') 
     235  end 
     236end 
     237 
     238 
     239begin 
     240  require_library_or_gem('utf8proc_native') 
     241  require 'active_record/multibyte/handlers/utf8_handler_proc' 
     242  class UTF8HandlingTestProc < Test::Unit::TestCase 
     243    include UTF8HandlingTest 
     244    def setup 
     245      common_setup 
     246      @handler = ::ActiveSupport::Multibyte::Handlers::UTF8HandlerProc 
     247    end 
     248  end 
     249rescue LoadError 
     250end 
     251 
     252class UTF8HandlingTestPure < Test::Unit::TestCase 
     253  include UTF8HandlingTest 
     254  def setup 
     255    common_setup 
     256    @handler = ::ActiveSupport::Multibyte::Handlers::UTF8Handler 
     257  end 
     258end 
  • activesupport/Rakefile

    old new  
    2121  t.warning = true 
    2222} 
    2323 
     24desc 'Runs the conformance tests for unicode operations' 
     25task :test_conformance do 
     26  `ruby test/multibyte_conformance.rb` 
     27end 
     28 
    2429# Create compressed packages 
    2530dist_dirs = [ "lib", "test"] 
    2631 
     
    2934Rake::RDocTask.new { |rdoc| 
    3035  rdoc.rdoc_dir = 'doc' 
    3136  rdoc.title    = "Active Support -- Utility classes and standard library extensions from Rails" 
    32   rdoc.options << '--line-numbers' << '--inline-source' 
     37  rdoc.options << '--line-numbers' << '--inline-source' << '--charset=utf-8' 
    3338  rdoc.template = "#{ENV['template']}.rb" if ENV['template'] 
    3439  rdoc.rdoc_files.include('README', 'CHANGELOG') 
    3540  rdoc.rdoc_files.include('lib/active_support.rb') 
  • activesupport/lib/active_support/multibyte.rb

    old new  
     1module ActiveSupport::Multibyte 
     2  DEFAULT_NORMALIZATION_FORM = :kc 
     3  NORMALIZATIONS_FORMS = [:c, :kc, :d, :kd] 
     4end 
     5 
     6require 'active_support/multibyte/chars' 
  • activesupport/lib/active_support/core_ext/string.rb

    old new  
    33require File.dirname(__FILE__) + '/string/access' 
    44require File.dirname(__FILE__) + '/string/starts_ends_with' 
    55require File.dirname(__FILE__) + '/string/iterators' 
     6require File.dirname(__FILE__) + '/string/unicode' 
    67 
    78class String #:nodoc: 
    89  include ActiveSupport::CoreExtensions::String::Access 
     
    1011  include ActiveSupport::CoreExtensions::String::Inflections 
    1112  include ActiveSupport::CoreExtensions::String::StartsEndsWith 
    1213  include ActiveSupport::CoreExtensions::String::Iterators 
     14  include ActiveSupport::CoreExtensions::String::Unicode 
    1315end 
  • activesupport/lib/active_support/core_ext/string/unicode.rb

    old new  
     1module ActiveSupport #:nodoc: 
     2  module CoreExtensions #:nodoc: 
     3    module String #:nodoc: 
     4      # Define methods for handling Unicode data. 
     5      module Unicode 
     6        # +chars+ is a Unicode safe proxy for string methods. It creates and returns an instance of the 
     7        # ActiveSupport::Multibyte::Chars class which encapsulates the original string. Unicode safe versions of all 
     8        # the String methods are defined on this proxy class. Undefined methods are forwarded to String, so all of the 
     9        # string overrides can also be called through the +chars+ proxy. 
     10        # 
     11        #   name = 'Claus Müller' 
     12        #   name.reverse #=> "rell??M sualC" 
     13        #   name.length #=> 13 
     14        # 
     15        #   name.chars.reverse.to_s #=> "rellüM sualC" 
     16        #   name.chars.length #=> 12 
     17        #    
     18        # 
     19        # All the methods on the chars proxy which normally return a string will return a Chars object. This allows 
     20        # method chaining on the result of any of these methods. 
     21        # 
     22        #   name.chars.reverse.length #=> 12 
     23        # 
     24        # The Chars object tries to be as interchangeable with String objects as possible: sorting and comparing between 
     25        # String and Chars work as expected. The bang! methods change the internal string representation in the Chars 
     26        # object. Interoperability problems can be resolved easily with a +to_s+ call. 
     27        # 
     28        # For more information about the methods defined on the Chars proxy see ActiveSupport::Multibyte::Chars and 
     29        # ActiveSupport::Multibyte::Handlers::UTF8Handler 
     30        def chars 
     31          ActiveSupport::Multibyte::Chars.new(self) 
     32        end 
     33 
     34        # Returns true if the string has UTF-8 semantics (a String used for purely byte resources is unlikely to have 
     35        # them), returns false otherwise. 
     36        def is_utf8? 
     37          ActiveSupport::Multibyte::Handlers::UTF8Handler.consumes?(self) 
     38        end 
     39      end 
     40    end 
     41  end 
     42end 
  • activesupport/lib/active_support/multibyte/handlers/utf8_handler.rb

    old new  
     1# Contains all the handlers and helper classes 
     2module ActiveSupport::Multibyte::Handlers 
     3  class EncodingError < ArgumentError; end 
     4   
     5  class Codepoint #:nodoc: 
     6    attr_accessor :code, :combining_class, :decomp_type, :decomp_mapping, :uppercase_mapping, :lowercase_mapping 
     7  end 
     8   
     9  class UnicodeDatabase #:nodoc: 
     10    attr_accessor :codepoints, :composition_exclusion, :composition_map, :boundary 
     11     
     12    # Creates a new UnicodeDatabase instance and loads the database. 
     13    def initialize 
     14      begin 
     15        @codepoints, @composition_exclusion, @composition_map, @boundary = self.class.load 
     16      rescue Exception => e 
     17        $stderr.write "Couldn't load the unicode tables for UTF8Handler (#{e.to_s}), handler is unusable\n" 
     18      end 
     19      @codepoints ||= Hash.new(Codepoint.new) 
     20      @composition_exclusion ||= [] 
     21      @composition_map ||= {} 
     22      @boundary ||= {} 
     23       
     24      # Redefine the === method so we can write shorter rules for grapheme cluster breaks 
     25      @boundary.each do |k,_| 
     26        @boundary[k].instance_eval do 
     27          def ===(other) 
     28            detect { |i| i === other } ? true : false 
     29          end 
     30        end if @boundary[k].kind_of?(Array) 
     31      end 
     32    end 
     33     
     34    # Shortcut to ucd.codepoints[] 
     35    def [](index); @codepoints[index]; end 
     36     
     37    # Returns the filename of the unicode database 
     38    def self.filename 
     39      File.expand_path File.dirname(__FILE__) + '/../../values/unicode_tables.dat' 
     40    end 
     41     
     42    # Loads the unicode database and returns all the internal objects of UnicodeDatabase 
     43    def self.load; File.open(self.filename) { |f| Marshal.load f.read }; end 
     44  end 
     45   
     46  # UTF8Handler implements Unicode aware operations for strings, these operations will be used by the Chars 
     47  # proxy when $KCODE is set to 'UTF8'. 
     48  class UTF8Handler 
     49    # UniCode Database 
     50    UCD = UnicodeDatabase.new 
     51     
     52    # Hangul character boundaries and properties 
     53    HANGUL_SBASE = 0xAC00 
     54    HANGUL_LBASE = 0x1100 
     55    HANGUL_VBASE = 0x1161 
     56    HANGUL_TBASE = 0x11A7 
     57    HANGUL_LCOUNT = 19 
     58    HANGUL_VCOUNT = 21 
     59    HANGUL_TCOUNT = 28 
     60    HANGUL_NCOUNT = HANGUL_VCOUNT * HANGUL_TCOUNT 
     61    HANGUL_SCOUNT = 11172 
     62    HANGUL_SLAST = HANGUL_SBASE + HANGUL_SCOUNT 
     63    HANGUL_JAMO_FIRST = 0x1100 
     64    HANGUL_JAMO_LAST = 0x11FF 
     65     
     66    # All the unicode whitespace 
     67    UNICODE_WHITESPACE = [ 
     68      (0x0009..0x000D).to_a,  # White_Space # Cc   [5] <control-0009>..<control-000D> 
     69      0x0020,          # White_Space # Zs       SPACE 
     70      0x0085,          # White_Space # Cc       <control-0085> 
     71      0x00A0,          # White_Space # Zs       NO-BREAK SPACE 
     72      0x1680,          # White_Space # Zs       OGHAM SPACE MARK 
     73      0x180E,          # White_Space # Zs       MONGOLIAN VOWEL SEPARATOR 
     74      (0x2000..0x200A).to_a, # White_Space # Zs  [11] EN QUAD..HAIR SPACE 
     75      0x2028,          # White_Space # Zl       LINE SEPARATOR 
     76      0x2029,          # White_Space # Zp       PARAGRAPH SEPARATOR 
     77      0x202F,          # White_Space # Zs       NARROW NO-BREAK SPACE 
     78      0x205F,          # White_Space # Zs       MEDIUM MATHEMATICAL SPACE 
     79      0x3000,          # White_Space # Zs       IDEOGRAPHIC SPACE 
     80    ].flatten.freeze 
     81     
     82    # BOM (byte order mark) can also be seen as whitespace, it's a non-rendering character used to distinguish 
     83    # between little and big endian. This is not an issue in utf-8, so it must be ignored. 
     84    UNICODE_LEADERS_AND_TRAILERS = UNICODE_WHITESPACE + [65279] # ZERO-WIDTH NO-BREAK SPACE aka BOM 
     85     
     86    # Borrowed from the Kconv library by Shinji KONO - (also as seen on the W3C site) 
     87     UTF8_PAT = /\A(?: 
     88                   [\x00-\x7f]                                     | 
     89                   [\xc2-\xdf] [\x80-\xbf]                         | 
     90                   \xe0        [\xa0-\xbf] [\x80-\xbf]             | 
     91                   [\xe1-\xef] [\x80-\xbf] [\x80-\xbf]             | 
     92                   \xf0        [\x90-\xbf] [\x80-\xbf] [\x80-\xbf] | 
     93                   [\xf1-\xf3] [\x80-\xbf] [\x80-\xbf] [\x80-\xbf] | 
     94                   \xf4        [\x80-\x8f] [\x80-\xbf] [\x80-\xbf] 
     95                  )*\z/xn 
     96     
     97    # Returns a regular expression pattern that matches the passed Unicode codepoints 
     98    def self.codepoints_to_pattern(array_of_codepoints) #:nodoc: 
     99      array_of_codepoints.collect{ |e| [e].pack 'U*' }.join('|')  
     100    end 
     101    UNICODE_TRAILERS_PAT = /(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+\Z/ 
     102    UNICODE_LEADERS_PAT = /\A(#{codepoints_to_pattern(UNICODE_LEADERS_AND_TRAILERS)})+/ 
     103     
     104    class << self 
     105       
     106      # /// 
     107      # /// BEGIN String method overrides 
     108      # /// 
     109       
     110      # Inserts the passed string at specified codepoint offsets 
     111      def insert(str, offset, fragment) 
     112        str.replace( 
     113          u_unpack(str).insert( 
     114            offset, 
     115            u_unpack(fragment) 
     116          ).flatten.pack('U*') 
     117        ) 
     118      end 
     119       
     120      # Returns the position of the passed argument in the string, counting in codepoints 
     121      def index(str, *args) 
     122        bidx = str.index(*args) 
     123        bidx ? (u_unpack(str.slice(0...bidx)).size) : nil 
     124      end 
     125       
     126      # Does Unicode-aware rstrip 
     127      def rstrip(str) 
     128        str.gsub(UNICODE_TRAILERS_PAT, '') 
     129      end 
     130       
     131      # Does Unicode-aware lstrip 
     132      def lstrip(str) 
     133        str.gsub(UNICODE_LEADERS_PAT, '') 
     134      end 
     135       
     136      # Removes leading and trailing whitespace 
     137      def strip(str) 
     138        str.gsub(UNICODE_LEADERS_PAT, '').gsub(UNICODE_TRAILERS_PAT, '') 
     139      end 
     140       
     141      # Returns the number of codepoints in the string 
     142      def size(str) 
     143        u_unpack(str).size 
     144      end 
     145      alias_method :length, :size 
     146       
     147      # Reverses codepoints in the string. 
     148      def reverse(str) 
     149        u_unpack(str).reverse.pack('U*') 
     150      end 
     151       
     152      # Implements Unicode-aware slice with codepoints. Slicing on one point returns the codepoints for that 
     153      # character. 
     154      def slice(str, *args) 
     155        if (args.size == 2 && args.first.is_a?(Range)) 
     156          raise TypeError, 'cannot convert Range into Integer' # Do as if we were native 
     157        elsif args[0].kind_of? Range 
     158          cps = u_unpack(str).slice(*args) 
     159          cps.nil? ? nil : cps.pack('U*') 
     160        elsif args[0].kind_of? Numeric 
     161          u_unpack(str)[args[0]] 
     162        else 
     163          str.slice(*args) 
     164        end 
     165      end 
     166      alias_method :[], :slice 
     167       
     168      # Convert characters in the string to uppercase 
     169      def upcase(str); to_case :uppercase_mapping, str; end 
     170       
     171      # Convert characters in the string to lowercase 
     172      def downcase(str); to_case :lowercase_mapping, str; end 
     173       
     174      # Returns a copy of +str+ with the first character converted to uppercase and the remainder to lowercase 
     175      def capitalize(str) 
     176        upcase(slice(str, 0..0)) + downcase(slice(str, 1..-1) || '') 
     177      end 
     178       
     179      # /// 
     180      # /// Extra String methods for unicode operations 
     181      # /// 
     182       
     183      # Returns the KC normalization of the string by default. NFKC is considered the best normalization form for 
     184      # passing strings to databases and validations. 
     185      # 
     186      # * <tt>str</tt>: The string to perform normalization on. 
     187      # * <tt>form</tt>: The form you want to normalize in. Should be one of the following: :c, :kc, :d or :kd. 
     188      def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) 
     189        # See http://www.unicode.org/reports/tr15, Table 1 
     190        codepoints = u_unpack(str) 
     191        case form 
     192          when :d 
     193            reorder_characters(decompose_codepoints(:canonical, codepoints)) 
     194          when :c 
     195            compose_codepoints reorder_characters(decompose_codepoints(:canonical, codepoints)) 
     196          when :kd 
     197            reorder_characters(decompose_codepoints(:compatability, codepoints)) 
     198          when :kc 
     199            compose_codepoints reorder_characters(decompose_codepoints(:compatability, codepoints)) 
     200          else 
     201            raise ArgumentError, "#{form} is not a valid normalization variant", caller 
     202        end.pack('U*') 
     203      end 
     204       
     205      # Perform decomposition on the characters in the string 
     206      def decompose(str) 
     207        decompose_codepoints(:canonical, u_unpack(str)).pack('U*') 
     208      end 
     209       
     210      # Perform composition on the characters in the string 
     211      def compose(str) 
     212        compose_codepoints u_unpack(str).pack('U*') 
     213      end 
     214       
     215      # /// 
     216      # /// BEGIN Helper methods for unicode operation 
     217      # /// 
     218       
     219      # Used to translate an offset from bytes to characters, for instance one received from a regular expression match 
     220      def translate_offset(str, byte_offset) 
     221        return 0 if str == '' 
     222        return nil if byte_offset.nil? 
     223        chunk = str[0..byte_offset] 
     224        begin 
     225          begin 
     226            chunk.unpack('U*').length - 1 
     227          rescue ArgumentError => e 
     228            chunk = str[0..(byte_offset+=1)] 
     229            # Stop retrying at the end of the string 
     230            raise e unless byte_offset < chunk.length  
     231            # We damaged a character, retry 
     232            retry 
     233          end 
     234        # Catch the ArgumentError so we can throw our own 
     235        rescue ArgumentError  
     236          raise EncodingError.new('malformed UTF-8 character') 
     237        end 
     238      end 
     239       
     240      # Checks if the string is valid UTF8. 
     241      def consumes?(str) 
     242        # Unpack is a little bit faster than regular expressions 
     243        begin 
     244          str.unpack('U*') 
     245          true 
     246        rescue ArgumentError 
     247          false 
     248        end 
     249      end 
     250       
     251      # Returns the number of grapheme clusters in the string. This method is very likely to be moved or renamed 
     252      # in future versions. 
     253      def g_length(str) 
     254        g_unpack(str).length 
     255      end 
     256       
     257      # Strips all the non-utf-8 bytes from the string resulting in a valid utf-8 string 
     258      def tidy_bytes(str) 
     259        str.split(//u).reject { |c| !UTF8_PAT.match(c) }.join 
     260      end 
     261       
     262      protected 
     263       
     264      # Detect whether the codepoint is in a certain character class. Primarily used by the 
     265      # grapheme cluster support. 
     266      def in_char_class?(codepoint, classes) 
     267        classes.detect { |c| UCD.boundary[c] === codepoint } ? true : false 
     268      end 
     269       
     270      # Unpack the string at codepoints boundaries 
     271      def u_unpack(str) 
     272        begin 
     273          str.unpack 'U*' 
     274        rescue ArgumentError 
     275          raise EncodingError.new('malformed UTF-8 character') 
     276        end 
     277      end 
     278       
     279      # Unpack the string at grapheme boundaries instead of codepoint boundaries 
     280      def g_unpack(str) 
     281        codepoints = u_unpack(str) 
     282        unpacked = [] 
     283        pos = 0 
     284        marker = 0 
     285        eoc = codepoints.length 
     286        while(pos < eoc) 
     287          pos += 1 
     288          previous = codepoints[pos-1] 
     289          current = codepoints[pos] 
     290          if ( 
     291              # CR X LF 
     292              one = ( previous == UCD.boundary[:cr] and current == UCD.boundary[:lf] ) or 
     293              # L X (L|V|LV|LVT) 
     294              two = ( UCD.boundary[:l] === previous and in_char_class?(current, [:l,:v,:lv,:lvt]) ) or 
     295              # (LV|V) X (V|T) 
     296              three = ( in_char_class?(previous, [:lv,:v]) and in_char_class?(current, [:v,:t]) ) or 
     297              # (LVT|T) X (T) 
     298              four = ( in_char_class?(previous, [:lvt,:t]) and UCD.boundary[:t] === current ) or 
     299              # X Extend 
     300              five = (UCD.boundary[:extend] === current) 
     301            ) 
     302          else 
     303            unpacked << codepoints[marker..pos-1] 
     304            marker = pos 
     305          end 
     306        end  
     307        unpacked 
     308      end 
     309       
     310      # Reverse operation of g_unpack 
     311      def g_pack(unpacked) 
     312        unpacked.flatten 
     313      end 
     314       
     315      # Convert characters to a different case 
     316      def to_case(way, str) 
     317        u_unpack(str).map do |codepoint| 
     318          cp = UCD[codepoint]  
     319          unless cp.nil? 
     320            ncp = cp.send(way) 
     321            ncp > 0 ? ncp : codepoint 
     322          else 
     323            codepoint 
     324          end 
     325        end.pack('U*') 
     326      end 
     327       
     328      # Re-order codepoints so the string becomes canonical 
     329      def reorder_characters(codepoints) 
     330        length = codepoints.length- 1 
     331        pos = 0 
     332        while pos < length do 
     333          cp1, cp2 = UCD[codepoints[pos]], UCD[codepoints[pos+1]] 
     334          if (cp1.combining_class > cp2.combining_class) && (cp2.combining_class > 0) 
     335            codepoints[pos..pos+1] = cp2.code, cp1.code 
     336            pos += (pos > 0 ? -1 : 1) 
     337          else 
     338            pos += 1 
     339          end 
     340        end 
     341        codepoints 
     342      end 
     343       
     344      # Decompose composed characters to the decomposed form 
     345      def decompose_codepoints(type, codepoints) 
     346        codepoints.inject([]) do |decomposed, cp| 
     347          # if it's a hangul syllable starter character 
     348          if HANGUL_SBASE <= cp and cp < HANGUL_SLAST 
     349            sindex = cp - HANGUL_SBASE 
     350            ncp = [] # new codepoints 
     351            ncp << HANGUL_LBASE + sindex / HANGUL_NCOUNT 
     352            ncp << HANGUL_VBASE + (sindex % HANGUL_NCOUNT) / HANGUL_TCOUNT 
     353            tindex = sindex % HANGUL_TCOUNT 
     354            ncp << (HANGUL_TBASE + tindex) unless tindex == 0 
     355            decomposed.concat ncp 
     356          # if the codepoint is decomposable with the current decomposition type 
     357          elsif (ncp = UCD[cp].decomp_mapping) and (!UCD[cp].decomp_type || type == :compatability) 
     358            decomposed.concat decompose_codepoints(type, ncp.dup) 
     359          else 
     360            decomposed << cp 
     361          end 
     362        end 
     363      end 
     364       
     365      # Compose decomposed characters to the composed form 
     366      def compose_codepoints(codepoints) 
     367        pos = 0 
     368        eoa = codepoints.length - 1 
     369        starter_pos = 0 
     370        starter_char = codepoints[0] 
     371        previous_combining_class = -1 
     372        while pos < eoa 
     373          pos += 1 
     374          lindex = starter_char - HANGUL_LBASE 
     375          # -- Hangul 
     376          if 0 <= lindex and lindex < HANGUL_LCOUNT 
     377            vindex = codepoints[starter_pos+1] - HANGUL_VBASE rescue vindex = -1 
     378            if 0 <= vindex and vindex < HANGUL_VCOUNT 
     379              tindex = codepoints[starter_pos+2] - HANGUL_TBASE rescue tindex = -1 
     380              if 0 <= tindex and tindex < HANGUL_TCOUNT 
     381                j = starter_pos + 2 
     382                eoa -= 2 
     383              else 
     384                tindex = 0 
     385                j = starter_pos + 1 
     386                eoa -= 1 
     387              end 
     388              codepoints[starter_pos..j] = (lindex * HANGUL_VCOUNT + vindex) * HANGUL_TCOUNT + tindex + HANGUL_SBASE 
     389            end 
     390            starter_pos += 1 
     391            starter_char = codepoints[starter_pos] 
     392          # -- Other characters 
     393          else 
     394            current_char = codepoints[pos] 
     395            current = UCD[current_char] 
     396            if current.combining_class > previous_combining_class 
     397              if ref = UCD.composition_map[starter_char] 
     398                composition = ref[current_char] 
     399              else 
     400                composition = nil 
     401              end 
     402              unless composition.nil? 
     403                codepoints[starter_pos] = composition 
     404                starter_char = composition 
     405                codepoints.delete_at pos 
     406                eoa -= 1 
     407                pos -= 1 
     408                previous_combining_class = -1 
     409              else 
     410                previous_combining_class = current.combining_class 
     411              end 
     412            else 
     413              previous_combining_class = current.combining_class 
     414            end 
     415            if current.combining_class == 0 
     416              starter_pos = pos 
     417              starter_char = codepoints[pos] 
     418            end 
     419          end 
     420        end 
     421        codepoints 
     422      end 
     423    end 
     424  end 
     425end 
  • activesupport/lib/active_support/multibyte/handlers/passthru_handler.rb

    old new  
     1# Chars uses this handler when $KCODE is not set to 'UTF8'. Because this handler doesn't define any methods, all calls 
     2# will be forwarded to String. 
     3class ActiveSupport::Multibyte::Handlers::PassthruHandler 
     4   
     5  # Return the original byteoffset 
     6  def self.translate_offset(string, byte_offset) #:nodoc: 
     7    byte_offset 
     8  end 
     9end 
  • activesupport/lib/active_support/multibyte/handlers/utf8_handler_proc.rb

    old new  
     1# Methods in this handler call functions in the utf8proc ruby extension. These are significantly faster than the 
     2# pure ruby versions. Chars automatically uses this handler when it can load the utf8proc extension. For 
     3# documentation on handler methods see UTF8Handler. 
     4class ActiveSupport::Multibyte::Handlers::UTF8HandlerProc < ActiveSupport::Multibyte::Handlers::UTF8Handler 
     5   
     6  class << self 
     7    def normalize(str, form=ActiveSupport::Multibyte::DEFAULT_NORMALIZATION_FORM) #:nodoc: 
     8      codepoints = str.unpack('U*') 
     9      case form 
     10        when :d 
     11          utf8map(str, :stable) 
     12        when :c 
     13          utf8map(str, :stable, :compose) 
     14        when :kd 
     15          utf8map(str, :stable, :compat) 
     16        when :kc 
     17          utf8map(str, :stable, :compose, :compat) 
     18        else 
     19          raise ArgumentError, "#{form} is not a valid normalization variant", caller 
     20      end 
     21    end 
     22     
     23    def decompose(str) #:nodoc: 
     24      utf8map(str, :stable) 
     25    end 
     26     
     27    def downcase(str) #:nodoc:
     28      utf8map(str, :casefold) 
     29    end 
     30     
     31    protected 
     32     
     33    def utf8map(str, *option_array) #:nodoc: 
     34      options = 0 
     35      option_array.each do |option| 
     36        flag = Utf8Proc::Options[option] 
     37        raise ArgumentError, "Unknown argument given to utf8map." unless 
     38          flag 
     39        options |= flag 
     40      end 
     41      return Utf8Proc::utf8map(str, options) 
     42    end 
     43  end 
     44end 
  • activesupport/lib/active_support/multibyte/chars.rb

    old new  
     1require 'active_support/multibyte/handlers/utf8_handler' 
     2require 'active_support/multibyte/handlers/passthru_handler' 
     3 
     4# Encapsulates all the functionality related to the Chars proxy. 
     5module ActiveSupport::Multibyte 
     6  # Chars enables you to work transparently with multibyte encodings in the Ruby String class without having extensive 
     7  # knowledge about the encoding. A Chars object accepts a string upon initialization and proxies String methods in an 
     8  # encoding safe manner. All the normal String methods are also implemented on the proxy. 
     9  # 
     10  # String methods are proxied through the Chars object, and can be accessed through the +chars+ method. Methods 
     11  # which would normally return a String object now return a Chars object so methods can be chained. 
     12  # 
     13  #   "The Perfect String  ".chars.downcase.strip.normalize #=> "the perfect string" 
     14  # 
     15  # Chars objects are perfectly interchangeable with String objects as long as no explicit class checks are made. 
     16  # If certain methods do explicitly check the class, call +to_s+ before you pass chars objects to them. 
     17  # 
     18  #   bad.explicit_checking_method "T".chars.downcase.to_s 
     19  # 
     20  # The actual operations on the string are delegated to handlers. Theoretically handlers can be implemented for 
     21  # any encoding, but the default handler handles UTF-8. This handler is set during initialization. If you want to 
     22  # use your own handler, you can set it on the Chars class. Look at the UTF8Handler source for an example of how to 
     23  # implement your own handler. If you want your own handler to work on anything but UTF-8 you probably also 
     24  # want to override Chars#handler. 
     25  # 
     26  #   ActiveSupport::Multibyte::Chars.handler = MyHandler 
     27  # 
     28  # Note that a few methods are defined on Chars instead of the handler because they are defined on Object or Kernel 
     29  # and method_missing can't catch them. 
     30  class Chars 
     31     
     32    attr_reader :string # The contained string 
     33    alias_method :to_s, :string 
     34     
     35    include Comparable 
     36     
     37    # The magic method to make String and Chars comparable 
     38    def to_str 
     39      # Using any other ways of overriding the String itself will lead you all the way from infinite loops to 
     40      # core dumps. Don't go there. 
     41      @string 
     42    end 
     43     
     44    # Create a new Chars instance. 
     45    def initialize(str) 
     46      @string = (str.string rescue str) 
     47    end 
     48     
     49    # Returns -1, 0 or +1 depending on whether the Chars object is to be sorted before, equal or after the 
     50    # object on the right side of the operation. It accepts any object that implements +to_s+. See String.<=> 
     51    # for more details. 
     52    def <=>(other); @string <=> other.to_s; end 
     53     
     54    # Works just like String#split, with the exception that the items in the resulting list are Chars 
     55    # instances instead of String. This makes chaining methods easier. 
     56    def split(*args) 
     57      @string.split(*args).map { |i| i.chars } 
     58    end 
     59     
     60    # Gsub works exactly the same as gsub on a normal string. 
     61    def gsub(*a, &b); @string.gsub(*a, &b).chars; end 
     62     
     63    # Like String.=~ only it returns the character offset (in codepoints) instead of the byte offset. 
     64    def =~(other) 
     65      handler.translate_offset(@string, @string =~ other) 
     66    end 
     67     
     68    # Try to forward all undefined methods to the handler, when a method is not defined on the handler, send it to 
     69    # the contained string. Method_missing is also responsible for making the bang! methods destructive. 
     70    def method_missing(m, *a, &b) 
     71      begin 
     72        # Simulate methods with a ! at the end because we can't touch the enclosed string from the handlers. 
     73        if m.to_s =~ /^(.*)\!$/ 
     74          result = handler.send($1, @string, *a, &b) 
     75          if result == @string 
     76            result = nil 
     77          else 
     78            @string.replace result 
     79          end 
     80        else 
     81          result = handler.send(m, @string, *a, &b) 
     82        end 
     83      rescue NoMethodError 
     84        result = @string.send(m, *a, &b) 
     85      rescue Handlers::EncodingError 
     86        @string.replace handler.tidy_bytes(@string) 
     87        retry 
     88      end 
     89       
     90      if result.kind_of?(String) 
     91        result.chars 
     92      else 
     93        result 
     94      end 
     95    end 
     96     
     97    # Set the handler class for the Chars objects. 
     98    def self.handler=(klass) 
     99      @@handler = klass 
     100    end 
     101 
     102    # Returns the proper handler for the contained string depending on $KCODE and the encoding of the string. This 
     103    # method is used internally to always redirect messages to the proper classes depending on the context. 
     104    def handler 
     105      if utf8_pragma? 
     106        @@handler 
     107      else 
     108        ActiveSupport::Multibyte::Handlers::PassthruHandler 
     109      end 
     110    end 
     111 
     112    private 
     113       
     114      # +utf8_pragma+ checks if it can send this string to the handlers. It makes sure @string isn't nil and $KCODE is 
     115      # set to 'UTF8'. 
     116      def utf8_pragma? 
     117        !@string.nil? && ($KCODE == 'UTF8') 
     118      end 
     119  end 
     120end 
     121 
     122# When we can load the utf8proc library, override normalization with the faster methods 
     123begin 
     124  require_library_or_gem('utf8proc_native') 
     125  require 'active_support/multibyte/handlers/utf8_handler_proc' 
     126  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8HandlerProc 
     127rescue LoadError 
     128  ActiveSupport::Multibyte::Chars.handler = ActiveSupport::Multibyte::Handlers::UTF8Handler 
     129end 
  • activesupport/lib/active_support/multibyte/generators/generate_tables.rb

    old new  
     1#!/usr/bin/env ruby 
     2 
     3require File.dirname(__FILE__) + '/../../../active_support' 
     4require 'open-uri' 
     5 
     6module ActiveSupport::Multibyte::Handlers 
     7  class UnicodeTableGenerator #:nodoc: 
     8    SOURCES = { 
     9      :codepoints => 'http://www.unicode.org/Public/5.0.0/ucd/UnicodeData.txt', 
     10      :composition_exclusion => 'http://www.unicode.org/Public/5.0.0/ucd/CompositionExclusions.txt', 
     11      :grapheme_break_property => 'http://www.unicode.org/Public/5.0.0/ucd/auxiliary/GraphemeBreakProperty.txt' 
     12    } 
     13     
     14    def initialize 
     15      @ucd = UnicodeDatabase.new 
     16       
     17      default = Codepoint.new 
     18      default.combining_class = 0 
     19      default.uppercase_mapping = 0 
     20      default.lowercase_mapping = 0 
     21      @ucd.codepoints = Hash.new(default) 
     22       
     23      @ucd.composition_exclusion = [] 
     24      @ucd.composition_map = {} 
     25      @ucd.boundary = {} 
     26    end 
     27     
     28    def parse_codepoints(line) 
     29      codepoint = Codepoint.new 
     30      raise "Could not parse input." unless line =~ /^ 
     31        ([0-9A-F]+);        # code 
     32        ([^;]+);            # name 
     33        ([A-Z]+);           # general category 
     34        ([0-9]+);           # canonical combining class 
     35        ([A-Z]+);           # bidi class 
     36        (<([A-Z]*)>)?       # decomposition type 
     37        ((\ ?[0-9A-F]+)*);  # decompomposition mapping 
     38        ([0-9]*);           # decimal digit 
     39        ([0-9]*);           # digit 
     40        ([^;]*);            # numeric 
     41        ([YN]*);            # bidi mirrored 
     42        ([^;]*);            # unicode 1.0 name 
     43        ([^;]*);            # iso comment 
     44        ([0-9A-F]*);        # simple uppercase mapping 
     45        ([0-9A-F]*);        # simple lowercase mapping 
     46        ([0-9A-F]*)$/ix     # simple titlecase mapping 
     47      codepoint.code              = $1.hex 
     48      #codepoint.name              = $2 
     49      #codepoint.category          = $3 
     50      codepoint.combining_class   = Integer($4) 
     51      #codepoint.bidi_class        = $5 
     52      codepoint.decomp_type       = $7 
     53      codepoint.decomp_mapping    = ($8=='') ? nil : $8.split.collect { |element| element.hex } 
     54      #codepoint.bidi_mirrored     = ($13=='Y') ? true : false 
     55      codepoint.uppercase_mapping = ($16=='') ? 0 : $16.hex 
     56      codepoint.lowercase_mapping = ($17=='') ? 0 : $17.hex 
     57      #codepoint.titlecase_mapping = ($18=='') ? nil : $18.hex 
     58      @ucd.codepoints[codepoint.code] = codepoint 
     59    end 
     60 
     61    def parse_grapheme_break_property(line) 
     62      if line =~ /^([0-9A-F\.]+)\s*;\s*([\w]+)\s*#/ 
     63        type = $2.downcase.intern 
     64        @ucd.boundary[type] ||= [] 
     65        if $1.include? '..' 
     66          parts = $1.split '..' 
     67          @ucd.boundary[type] << (parts[0].hex..parts[1].hex) 
     68        else 
     69          @ucd.boundary[type] << $1.hex 
     70        end 
     71      end 
     72    end 
     73     
     74    def parse_composition_exclusion(line) 
     75      if line =~ /^([0-9A-F]+)/i 
     76        @ucd.composition_exclusion << $1.hex 
     77      end 
     78    end 
     79     
     80    def create_composition_map 
     81      @ucd.codepoints.each do |_, cp| 
     82        if !cp.nil? and cp.combining_class == 0 and cp.decomp_type.nil? and !cp.decomp_mapping.nil? and cp.decomp_mapping.length == 2 and @ucd[cp.decomp_mapping[0]].combining_class == 0 and !@ucd.composition_exclusion.include?(cp.code) 
     83          @ucd.composition_map[cp.decomp_mapping[0]] ||= {} 
     84          @ucd.composition_map[cp.decomp_mapping[0]][cp.decomp_mapping[1]] = cp.code 
     85        end 
     86      end 
     87    end 
     88 
     89    def normalize_boundary_map 
     90      @ucd.boundary.each do |k,v| 
     91        if [:lf, :cr].include? k 
     92          @ucd.boundary[k] = v[0] 
     93        end 
     94      end 
     95    end 
     96   
     97    def parse 
     98      SOURCES.each do |type, url| 
     99        filename =  "/tmp/#{url.split('/').last}" 
     100        unless File.exist?(filename) 
     101          $stderr.puts "Downloading #{url.split('/').last}" 
     102          File.open(filename, 'w') do |target| 
     103            open(url) do |source| 
     104              source.each_line { |line| target.write line } 
     105            end 
     106          end 
     107        end 
     108        File.open(filename) do |file| 
     109          file.each_line { |line| send "parse_#{type}".intern, line } 
     110        end         
     111      end 
     112      create_composition_map 
     113      normalize_boundary_map 
     114    end 
     115     
     116    def dump_to(filename) 
     117      File.open(filename, 'w') do |f| 
     118        f.write Marshal.dump([@ucd.codepoints, @ucd.composition_exclusion, @ucd.composition_map, @ucd.boundary]) 
     119      end 
     120    end 
     121  end 
     122end 
     123 
     124if __FILE__ == $0 
     125  filename = ActiveSupport::Multibyte::Handlers::UnicodeDatabase.filename 
     126  generator = ActiveSupport::Multibyte::Handlers::UnicodeTableGenerator.new 
     127  generator.parse 
     128  print "Writing to: #{filename}" 
     129  generator.dump_to filename 
     130  puts " (#{File.size(filename)} bytes)" 
     131end 
  • activesupport/lib/active_support.rb

    old new  
    4040require 'active_support/values/time_zone' 
    4141 
    4242require 'active_support/json' 
     43 
     44require 'active_support/multibyte'