class StringPrepTablesGenerator

用于生成 stringprep 正则表达式的生成器。

将 Unicode 字符类与生成的表结合使用。生成的正则表达式仍用于测试手写的正则表达式是否符合规范。某些表与 ruby 的正则表达式引擎可用的任何字符属性都不够匹配,这些表改用由表生成的正则表达式。

常量

SASL_TABLES_PROHIBITED
SASL_TABLES_PROHIBITED_STORED
STRINGPREP_JSON_FILE
STRINGPREP_RFC_FILE
SURROGATES_RANGE

有效的 UTF-8 不能包含这些代码点,但无论如何还是要检查它们,使用 /\p{Cs}/ 😉

属性

json_filename[R]
rfc_filename[R]

公共类方法

new(rfc_filename: STRINGPREP_RFC_FILE, json_filename: STRINGPREP_JSON_FILE) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 21
# Stores the path of the RFC text to read (+rfc_filename+) and the path of
# the JSON data file (+json_filename+).
#
# Both keywords default to the corresponding STRINGPREP_* file constants.
def initialize(rfc_filename: STRINGPREP_RFC_FILE, json_filename: STRINGPREP_JSON_FILE)
  @rfc_filename, @json_filename = rfc_filename, json_filename
end

公共实例方法

arrays() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 44
# Memoized Hash of table name => flat Array, built by expanding every Range
# that #ranges holds for that table.
def arrays
  @arrays ||= ranges.transform_values {|table_ranges|
    table_ranges.flat_map {|range| range.to_a }
  }
end
asgn_regexps() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 47
# Returns the Hash of assigned regexps, building it with #asgn_regexps! the
# first time it is needed.
def asgn_regexps
  if @asgn_regexps
    @asgn_regexps
  else
    asgn_regexps!
  end
end
clean_deps() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 30
# Rake file list containing only the generated JSON data file.
#
# NOTE(review): uses the STRINGPREP_JSON_FILE constant rather than the
# instance's json_filename -- confirm that is intended.
def clean_deps
  Rake::FileList.new(STRINGPREP_JSON_FILE)
end
generate_json_data_file() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 32
# Reads the RFC text at +rfc_filename+, parses it with #parse_rfc_text, and
# writes the pretty-printed JSON to +json_filename+.
#
# Returns whatever File.write returns.
def generate_json_data_file
  require "json"
  rfc_text = File.read(rfc_filename)
  parsed   = parse_rfc_text(rfc_text)
  File.write(json_filename, JSON.pretty_generate(parsed))
end
json_deps() 点击切换源代码

用于 rake 依赖项

# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 28
# Rake file list of this source file and the RFC text file; used for rake
# dependencies.
def json_deps
  Rake::FileList.new(__FILE__, STRINGPREP_RFC_FILE)
end
merged_tables_regex(*table_names, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 49
# Concatenates the codepoint arrays of every table in +table_names+ and turns
# the combined list into a single regexp via #to_regexp.
#
# Raises KeyError (from Hash#fetch) for an unknown table name.
def merged_tables_regex(*table_names, negate: false)
  by_table   = arrays
  codepoints = table_names.flat_map {|table_name| by_table.fetch(table_name) }
  to_regexp(codepoints, negate: negate)
end
ranges() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 43
# Memoized Hash of table name => result of #to_ranges for that table.
def ranges
  @ranges ||= tables.transform_values {|table| to_ranges(table) }
end
rb_deps() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 29
# Rake file list of this source file and the JSON data file.
def rb_deps
  Rake::FileList.new(__FILE__, STRINGPREP_JSON_FILE)
end
regexp_for(*names, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 55
# Returns the regexp stored in #asgn_regexps under <tt>[*names, negate]</tt>,
# generating it with #merged_tables_regex and storing it when absent.
def regexp_for(*names, negate: false)
  cache_key = [*names, negate]
  asgn_regexps[cache_key] ||= merged_tables_regex(*names, negate: negate)
end
regexps() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 46
# Memoized Hash of table name => regexp generated by #to_regexp from that
# table's codepoint array.
def regexps
  @regexps ||= arrays.transform_values {|codepoints| to_regexp(codepoints) }
end
saslprep_rb() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 168
  # Returns, as a String, the Ruby source text of the generated
  # Net::IMAP::StringPrep::SASLprep module.
  #
  # Everything between the heredoc markers is output text, not code of this
  # generator.  It is an interpolating heredoc, so:
  # * the regex_str(...) calls, SASL_TABLES_PROHIBITED.inspect and
  #   bidi_failure_regexp.inspect are evaluated and embedded in the result;
  # * each doubled backslash is emitted as a single backslash.
  def saslprep_rb
    <<~RUBY
      # frozen_string_literal: true

      #--
      # This file is generated from RFC3454, by rake.  Don't edit directly.
      #++

      module Net::IMAP::StringPrep

        module SASLprep

          # RFC4013 §2.1 Mapping - mapped to space
          # >>>
          #   non-ASCII space characters (\\StringPrep\\[\\"C.1.2\\"]) that can
          #   be mapped to SPACE (U+0020)
          #
          # Equal to \\StringPrep\\[\\"C.1.2\\"].
          # Redefined here to avoid loading StringPrep::Tables unless necessary.
          MAP_TO_SPACE = #{regex_str "C.1.2"}

          # RFC4013 §2.1 Mapping - mapped to nothing
          # >>>
          #   the "commonly mapped to nothing" characters
          #   (\\StringPrep\\[\\"B.1\\"]) that can be mapped to nothing.
          #
          # Equal to \\StringPrep\\[\\"B.1\\"].
          # Redefined here to avoid loading StringPrep::Tables unless necessary.
          MAP_TO_NOTHING = #{regex_str "B.1"}

          # RFC4013 §2.3 Prohibited Output
          # >>>
          # * Non-ASCII space characters — \\StringPrep\\[\\"C.1.2\\"]
          # * ASCII control characters — \\StringPrep\\[\\"C.2.1\\"]
          # * Non-ASCII control characters — \\StringPrep\\[\\"C.2.2\\"]
          # * Private Use characters — \\StringPrep\\[\\"C.3\\"]
          # * Non-character code points — \\StringPrep\\[\\"C.4\\"]
          # * Surrogate code points — \\StringPrep\\[\\"C.5\\"]
          # * Inappropriate for plain text characters — \\StringPrep\\[\\"C.6\\"]
          # * Inappropriate for canonical representation characters — \\StringPrep\\[\\"C.7\\"]
          # * Change display properties or deprecated characters — \\StringPrep\\[\\"C.8\\"]
          # * Tagging characters — \\StringPrep\\[\\"C.9\\"]
          TABLES_PROHIBITED = #{SASL_TABLES_PROHIBITED.inspect}.freeze

          # Adds unassigned (by Unicode 3.2) codepoints to TABLES_PROHIBITED.
          #
          # RFC4013 §2.5 Unassigned Code Points
          # >>>
          #   This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its
          #   list of unassigned code points.
          TABLES_PROHIBITED_STORED = ["A.1", *TABLES_PROHIBITED].freeze

          # A Regexp matching codepoints prohibited by RFC4013 §2.3.
          #
          # This combines all of the TABLES_PROHIBITED tables.
          PROHIBITED_OUTPUT = #{regex_str(*SASL_TABLES_PROHIBITED)}

          # RFC4013 §2.5 Unassigned Code Points
          # >>>
          #   This profile specifies the \\StringPrep\\[\\"A.1\\"] table as its
          #   list of unassigned code points.
          #
          # Equal to \\StringPrep\\[\\"A.1\\"].
          # Redefined here to avoid loading StringPrep::Tables unless necessary.
          UNASSIGNED = #{regex_str "A.1"}

          # A Regexp matching codepoints prohibited by RFC4013 §2.3 and §2.5.
          #
          # This combines PROHIBITED_OUTPUT and UNASSIGNED.
          PROHIBITED_OUTPUT_STORED = Regexp.union(
            UNASSIGNED, PROHIBITED_OUTPUT
          ).freeze

          # Bidirectional Characters [StringPrep, §6]
          #
          # A Regexp for strings that don't satisfy StringPrep's Bidirectional
          # Characters rules.
          #
          # Equal to StringPrep::Tables::BIDI_FAILURE.
          # Redefined here to avoid loading StringPrep::Tables unless necessary.
          BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze

          # A Regexp matching strings prohibited by RFC4013 §2.3 and §2.4.
          #
          # This combines PROHIBITED_OUTPUT and BIDI_FAILURE.
          PROHIBITED = Regexp.union(
            PROHIBITED_OUTPUT, BIDI_FAILURE,
          )

          # A Regexp matching strings prohibited by RFC4013 §2.3, §2.4, and §2.5.
          #
          # This combines PROHIBITED_OUTPUT_STORED and BIDI_FAILURE.
          PROHIBITED_STORED = Regexp.union(
            PROHIBITED_OUTPUT_STORED, BIDI_FAILURE,
          )

        end
      end
    RUBY
  end
sets() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 45
# Memoized Hash of table name => Set of that table's codepoints.
def sets
  @sets ||= arrays.transform_values {|codepoints| codepoints.to_set }
end
stringprep_rb() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 59
  # Returns, as a String, the Ruby source text of the generated
  # Net::IMAP::StringPrep::Tables module.
  #
  # Everything between the heredoc markers is output text, not code of this
  # generator.  The interpolations embed, in order of appearance:
  # * one constant assignment per table (asgn_table), plus the negated D.1;
  # * the replacement maps for B.1, B.2 and B.3 (asgn_mapping; B.1 is given
  #   an explicit empty-string replacement);
  # * the inspected bidi regexps (bidi_fails_req2, bidi_fails_req3,
  #   bidi_failure_regexp);
  # * the TITLES and REGEXPS hash bodies (table_titles_rb, table_regexps_rb).
  def stringprep_rb
    <<~RUBY
      # frozen_string_literal: true

      #--
      # This file is generated from RFC3454, by rake.  Don't edit directly.
      #++

      module Net::IMAP::StringPrep

        module Tables

          #{asgn_table "A.1"}

          #{asgn_table "B.1"}

          #{asgn_table "B.2"}

          #{asgn_table "B.3"}

          #{asgn_mapping "B.1", ""}

          #{asgn_mapping "B.2"}

          #{asgn_mapping "B.3"}

          #{asgn_table "C.1.1"}

          #{asgn_table "C.1.2"}

          #{asgn_table "C.2.1"}

          #{asgn_table "C.2.2"}

          #{asgn_table "C.3"}

          #{asgn_table "C.4"}

          #{asgn_table "C.5"}

          #{asgn_table "C.6"}

          #{asgn_table "C.7"}

          #{asgn_table "C.8"}

          #{asgn_table "C.9"}

          #{asgn_table "D.1"}

          # Used to check req3 of bidirectional checks
          #{asgn_table "D.1", negate: true}

          #{asgn_table "D.2"}

          BIDI_DESC_REQ2 = "A string with RandALCat characters must not contain LCat characters."

          # Bidirectional Characters [StringPrep, §6], Requirement 2
          # >>>
          #   If a string contains any RandALCat character, the string MUST NOT
          #   contain any LCat character.
          BIDI_FAILS_REQ2 = #{bidi_fails_req2.inspect}.freeze

          BIDI_DESC_REQ3 = "A string with RandALCat characters must start and end with RandALCat characters."

          # Bidirectional Characters [StringPrep, §6], Requirement 3
          # >>>
          #   If a string contains any RandALCat character, a RandALCat
          #   character MUST be the first character of the string, and a
          #   RandALCat character MUST be the last character of the string.
          BIDI_FAILS_REQ3 = #{bidi_fails_req3.inspect}.freeze

          # Bidirectional Characters [StringPrep, §6]
          BIDI_FAILURE = #{bidi_failure_regexp.inspect}.freeze

          # Names of each codepoint table in the RFC-3454 appendices
          TITLES = {
            #{table_titles_rb}
          }.freeze

          # Regexps matching each codepoint table in the RFC-3454 appendices
          REGEXPS = {
            #{table_regexps_rb}
          }.freeze

          MAPPINGS = {
            "B.1" => [IN_B_1, MAP_B_1].freeze,
            "B.2" => [IN_B_2, MAP_B_2].freeze,
            "B.3" => [IN_B_3, MAP_B_3].freeze,
          }.freeze

        end
      end
    RUBY
  end
table_regexps_rb(indent = 3) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 161
# Renders the body of the generated REGEXPS hash: one
# <tt>"name" => IN_CONSTANT,</tt> entry per #asgn_regexps key whose second
# element is falsy, joined by a newline plus +indent+ two-space steps.
def table_regexps_rb(indent = 3)
  asgn_regexps.keys                     # e.g. [["A.1", false], ["D.1", true], ...]
    .reject {|key| key[1] }
    .map {|key| "#{key[0].inspect} => #{regexp_const_name(key[0])}," }
    .join("\n" + "  " * indent)
end
table_titles_rb(indent = 3) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 155
# Renders the body of the generated TITLES hash: one
# <tt>"name" => "title",</tt> entry per pair in #titles, joined by a newline
# plus +indent+ two-space steps.
def table_titles_rb(indent = 3)
  entries = titles.map {|name, title| "#{name.inspect} => #{title.inspect}," }
  entries.join("\n" + "  " * indent)
end
tables() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 41
# Memoized tables Hash: the first element returned by
# #load_tables_and_titles_from_json!.
def tables
  @tables ||= load_tables_and_titles_from_json![0]
end
titles() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 42
# Memoized titles Hash: the last element returned by
# #load_tables_and_titles_from_json!.
def titles
  @titles ||= load_tables_and_titles_from_json![-1]
end

私有实例方法

asgn_mapping(name, replacement = to_map(tables[name])) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 390
# Returns the Ruby source for a <tt>MAP_<table></tt> constant assignment,
# preceded by a comment naming the matching <tt>IN_<table></tt> constant.
#
# name::        table name, e.g. "B.1"; dots become underscores in the
#               constant names.
# replacement:: value written with +inspect+; defaults to the mapping Hash
#               built by #to_map from the named table.
#
# The comment uses the underscored constant name (IN_B_1), not the raw table
# name, so that it refers to a constant that actually exists.
def asgn_mapping(name, replacement = to_map(tables[name]))
  cname = name.tr(".", "_").upcase
  # The second line is indented four spaces to line up inside the generated
  # module body.
  "# Replacements for %s\n%s%s = %p.freeze" % [
    "IN_#{cname}", " " * 4, "MAP_#{cname}", replacement,
  ]
end
asgn_regex(name, regexp, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 407
# Records +regexp+ in #asgn_regexps under <tt>[name, negate]</tt> and returns
# the Ruby source of the commented constant assignment for it.
def asgn_regex(name, regexp, negate: false)
  asgn_regexps[[name, negate]] = regexp
  description = regexp_const_desc(name, negate: negate)
  const_name  = regexp_const_name(name, negate: negate)
  # The assignment line is indented four spaces for the generated module body.
  "# #{description}\n#{" " * 4}#{const_name} = #{regexp.inspect}.freeze"
end
asgn_regexps!() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 344
# Resets @asgn_regexps and registers one regexp per table, in the order
# below.  Tables that map onto character properties use a hand-written
# regexp (#asgn_regex); the others use the table-generated one (#asgn_table).
#
# Returns the populated Hash.
def asgn_regexps!
  @asgn_regexps = {}
  asgn_regex("A.1", /\p{^AGE=3.2}/)
  # B.1 could be written as follows if ruby supported every unicode property
  # (i.e. line break = word joiner):
  #   /[\u{00ad 034f 1806}\p{join_c}\p{VS}\p{lb=WJ}&&\p{age=3.2}]/
  asgn_table("B.1")
  asgn_table("B.2")
  asgn_table("B.3")
  asgn_regex("C.1.1", / /)
  asgn_regex("C.1.2", /[\u200b\p{Zs}&&[^ ]]/)
  asgn_regex("C.2.1", /[\x00-\x1f\x7f]/)
  # C.2.2 is the union
  #   Cc + Cf (as defined by Unicode 3.2) + Zl + Zp + 0xfffc
  # minus anything covered by C.2.1, C.8 or C.9.  Modern Unicode properties
  # differ significantly, so the table definition is loaded instead.
  asgn_table("C.2.2")
  asgn_regex("C.3", /\p{private use}/)
  asgn_regex("C.4", /\p{noncharacter code point}/)
  asgn_regex("C.5", /\p{surrogate}/)
  asgn_regex("C.6", /[\p{in specials}&&\p{AGE=3.2}&&\p{^NChar}]/)
  asgn_regex("C.7", /[\p{in ideographic description characters}&&\p{AGE=3.2}]/)
  # C.8 is a union of \p{Bidi Control} and Unicode 3.2 properties.  Those
  # properties have changed in modern Unicode (and so in modern ruby's regexp
  # character properties), so the table definition is loaded instead.
  asgn_table("C.8")
  asgn_regex("C.9", /[\p{in Tags}&&\p{AGE=3.2}]/)
  # ruby doesn't (currently) support /[\p{Bidi Class=R}\p{bc=AL}]/.  Besides,
  # StringPrep (based on Unicode 3.2) might not match the modern (14.0)
  # property value well anyway.
  asgn_table("D.1")
  asgn_table("D.1", negate: true) # used by BIDI_FAILS_REQ3
  asgn_table("D.2")
  @asgn_regexps
end
asgn_table(name, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 386
# Like #asgn_regex, but takes the regexp from #regexp_for instead of a
# hand-written one.
def asgn_table(name, negate: false)
  table_regexp = regexp_for(name, negate: negate)
  asgn_regex(name, table_regexp, negate: negate)
end
bidi_L() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 418
# Regexp for table D.2 (used as the LCat class by the bidi checks).
def bidi_L
  regexp_for("D.2")
end
bidi_R_AL() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 416
# Regexp for table D.1 (used as the RandALCat class by the bidi checks).
def bidi_R_AL
  regexp_for("D.1")
end
bidi_fails_req2() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 420
# Regexp matching strings that contain both a RandALCat and an LCat
# character, in either order.
def bidi_fails_req2
  r_al_then_l = /#{bidi_R_AL}.*?#{bidi_L}/mu # RandALCat followed by LCat
  l_then_r_al = /#{bidi_L}.*?#{bidi_R_AL}/mu # RandALCat preceded by LCat
  Regexp.union(r_al_then_l, l_then_r_al)
end
bidi_fails_req3() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 427
def bidi_fails_req3
  # contains RandALCat:
  Regexp.union(
    /\A#{bidi_not_R_AL}.*?#{bidi_R_AL}/mu, # but doesn't start with RandALCat
    /#{bidi_R_AL}.*?#{bidi_not_R_AL}\z/mu, # but doesn't end   with RandALCat
  )
end
bidi_failure_regexp() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 435
# Regexp matching any string that fails bidi requirement 2 or requirement 3.
def bidi_failure_regexp
  req2 = bidi_fails_req2
  req3 = bidi_fails_req3
  Regexp.union(req2, req3)
end
bidi_not_R_AL() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 417
# Negated regexp for table D.1 (anything that is not RandALCat).
def bidi_not_R_AL
  regexp_for("D.1", negate: true)
end
load_tables_and_titles_from_json!() 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 298
# Parses the JSON file at +json_filename+, stores the result in @tables, moves
# its "titles" entry into @titles, and returns <tt>[@tables, @titles]</tt>.
def load_tables_and_titles_from_json!
  require "json"
  parsed  = JSON.parse(File.read(json_filename))
  @tables = parsed
  # "titles" is stored beside the tables in the file but is not a table itself.
  @titles = parsed.delete("titles")
  [@tables, @titles]
end
parse_rfc_text(rfc3454_text) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 271
# Parses the plain text of RFC 3454 into a Hash of its tables.
#
# Returns a Hash of table name => entries, plus a "titles" key holding a
# Hash of section name => title.  Entries are:
# * for regular tables, an Array of "XXXX" or "XXXX-YYYY" hex strings;
# * for mapping tables, a Hash of hex code point => Array of replacement
#   hex code points (possibly empty).
#
# Raises RuntimeError when a three-space-indented line inside a table
# matches neither entry format.
def parse_rfc_text(rfc3454_text)
  titles = {}
  tables = rfc3454_text
    .lines
    .each_with_object([]) {|line, acc|
      # acc is a list of [table_name, entries] pairs; a [nil, nil] pair marks
      # "not currently inside a table".
      current, table = acc.last
      case line
      when /^([A-D]\.[1-9](?:\.[1-9])?) (.*)/
        titles[$1] = $2
      when /^ {3}-{5} Start Table (\S*)/
        acc << [$1, []]
      when /^ {3}-{5} End Table /
        acc << [nil, nil]
      when /^ {3}([0-9A-F]+); ([ 0-9A-F]*)(?:;[^;]*)$/  # mapping tables
        table << [$1, $2.split(/ +/)] if current
      when /^ {3}([-0-9A-F]+)(?:;[^;]*)?$/              # regular tables
        table << $1 if current
      when /^ {3}(.*)/
        raise "expected to match %p" % $1 if current
      end
    }
    .to_h.compact # drops the nil => nil entry left by the End Table markers
    # Mapping tables are recognised by their [key, values] pair entries.
    # Checking the entry's class (rather than its size) keeps empty tables
    # and two-character code point strings from being misclassified.
    .transform_values {|t| t.first.is_a?(Array) ? t.to_h : t }
  tables["titles"] = titles
  tables
end
regex_str(*names, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 382
# Source text of a frozen regexp literal for the given tables: the inspected
# result of #regexp_for followed by ".freeze".
def regex_str(*names, negate: false)
  regexp = regexp_for(*names, negate: negate)
  "#{regexp.inspect}.freeze"
end
regexp_const_desc(name, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 397
# Comment text describing the regexp constant for table +name+.
#
# For a negated table this is a fixed sentence; otherwise it is the table's
# title (KeyError if #titles has no such table) followed by an escaped
# StringPrep["name"] reference.
def regexp_const_desc(name, negate: false)
  return "Matches the negation of the %s table" % [name] if negate

  title = titles.fetch(name)
  %q{%s \\StringPrep\\[\\"%s\\"]} % [title, name]
end
regexp_const_name(table_name, negate: false) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 403
# Name of the generated regexp constant for +table_name+: "IN_" plus the
# table name with dots replaced by underscores, and a "_NEGATED" suffix when
# +negate+ is truthy.
def regexp_const_name(table_name, negate: false)
  suffix = negate ? "_NEGATED" : ""
  "IN_#{table_name.tr(".", "_")}#{suffix}"
end
to_map(table) 点击切换源代码

TODO: 与 unicode_normalize 一起进行 DRY

# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 314
# Converts a mapping table of hex strings into a Hash of character =>
# replacement string.
#
# Each key is a hex code point and each value an Array of hex code points;
# both are parsed base 16 and packed as UTF-8 ("U*").
#
# TODO: DRY up with unicode_normalize
def to_map(table)
  table.to_hash
    .transform_keys {|hex| [Integer(hex, 16)].pack("U*") }
    .transform_values {|hexes| hexes.map {|hex| Integer(hex, 16) }.pack("U*") }
end
to_ranges(table) 点击切换源代码
# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 307
# Converts table entries ("XXXX" or "XXXX-YYYY" hex strings) into Integer
# ranges.  For a Hash (mapping table) only the keys are used.  A single code
# point becomes a one-element range.
def to_ranges(table)
  entries = table.is_a?(Hash) ? table.keys : table
  bounds  = entries.map {|entry| entry.split("-").map {|hex| Integer(hex, 16) } }
  bounds.map {|first, last| first..(last || first) }
end
to_regexp(codepoints, negate: false) 点击切换源代码

从代码点数组(而不是范围)开始,以去除合并表中的重复项。

# File net-imap-0.5.4/rakelib/string_prep_tables_generator.rb, line 324
# Builds a character-class Regexp from an Array of Integer codepoints.
#
# Takes codepoints rather than ranges so that duplicates coming from merged
# tables can be removed.  Contiguous runs are written as \u{a}-\u{b} ranges,
# isolated codepoints share a single \u{...} list, and surrogates are
# replaced by \p{Cs}.  With +negate+, the class is inverted.
def to_regexp(codepoints, negate: false)
  hex_runs = codepoints
    .reject {|cp| SURROGATES_RANGE === cp }          # drop surrogates (C.5, D.2)
    .uniq
    .sort
    .slice_when {|prev, succ| prev + 1 != succ }     # split into contiguous runs
    .map {|run| run.map {|cp| format("%04x", cp) } } # hex strings
  spans, lone = hex_runs.partition {|run| run.size > 1 }
  lone = lone.flatten

  parts = []
  parts << "^" if negate
  parts << "\\u{#{lone.join(" ")}}" unless lone.empty?
  spans.each {|span| parts << "\\u{#{span.first}}-\\u{#{span.last}}" }
  parts << "\\p{Cs}" if codepoints.any?(SURROGATES_RANGE) # not necessary :)
  Regexp.new "[#{parts.join}]"
end