admin管理员组

文章数量:1441952

中文分词代码(此代码为作者多年经验总结,以前发表过VB,PB版本)

/*  * created by yzh 2004.5.12  * 请大家引用时保留这段作者声明,此代码为开源代码;使用不受限制。  * 中文分词代码  *此代码为作者多年经验总结,以前发表过VB,PB版本 */

import java.io.BufferedReader; import java.io.IOException; import java.io.InputStream; import java.io.InputStreamReader; import java.util.Locale; import java.util.TreeMap; import java.util.TreeSet;

/**
 * Dictionary-driven word segmenter for Chinese text.
 *
 * <p>A line is scanned left to right. Runs of CJK unified ideographs (and
 * characters listed in the number data files) are grouped into words by
 * greedily extending the current word while the lexicon, the foreign-name
 * character set or the number character set allows it; all other characters
 * are copied through unchanged. Words are joined with a caller-supplied
 * separator.
 *
 * <p>Instances are obtained through {@link #getGBSegmenter()} or
 * {@link #getBig5Segmenter()}, which cache one instance per character form.
 * Data files are read as UTF-8 class-path resources relative to this class.
 * A missing or unreadable data file is reported on {@code System.err} and the
 * segmenter continues with whatever data it could load.
 */
public class ChineseSegmenter {

   /** Character form: traditional Chinese. */
   public final static int TRAD = 0;

   /** Character form: simplified Chinese. */
   public final static int SIMP = 1;

   /** Character form: both simplified and traditional Chinese. */
   public final static int BOTH = 2;

   /** Lexicon marker: the key is a complete word. */
   private static final String WHOLE_WORD = "1";

   /** Lexicon marker: the key is only the leading part of a longer word. */
   private static final String WORD_PREFIX = "2";

   /** Encoding of every data file. */
   private static final String ENCODING = "UTF-8";

   /** Cached simplified-Chinese instance, created on first request. */
   private static ChineseSegmenter gbSegmenter = null;

   /** Cached traditional-Chinese instance, created on first request. */
   private static ChineseSegmenter big5Segmenter = null;

   /** Lexicon: word or word prefix mapped to WHOLE_WORD or WORD_PREFIX. */
   private TreeMap zhwords;

   /** Single-character strings used to transliterate foreign names. */
   private TreeSet cforeign;

   /** Single-character strings that count as number characters. */
   private TreeSet cnumbers;

   /**
    * Builds a segmenter for one character form.
    *
    * @param charform     {@link #TRAD}, {@link #SIMP} or {@link #BOTH}; any
    *                     other value loads both number/foreign sets and no
    *                     lexicon
    * @param loadwordfile whether to load the word lexicon; when {@code false}
    *                     only the number and foreign character sets are loaded
    */
   private ChineseSegmenter(int charform, boolean loadwordfile) {
      cforeign = new TreeSet();
      cnumbers = new TreeSet();
      zhwords = new TreeMap();

      if (charform == SIMP) {
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
      } else if (charform == TRAD) {
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      } else { // BOTH
         loadset(cnumbers, "data/snumbers_u8.txt");
         loadset(cforeign, "data/sforeign_u8.txt");
         loadset(cnumbers, "data/tnumbers_u8.txt");
         loadset(cforeign, "data/tforeign_u8.txt");
      }

      if (loadwordfile) {
         loadLexicon(charform);
      }
   }

   /**
    * Drops the cached instances so the next factory call reloads the data
    * files.
    */
   public synchronized static void reset() {
      gbSegmenter = null;
      big5Segmenter = null;
   }

   /**
    * Returns the cached simplified-Chinese segmenter, creating it on first
    * use. As a side effect the JVM default locale is set to
    * {@link Locale#SIMPLIFIED_CHINESE} on every call.
    *
    * @return the simplified-Chinese segmenter
    */
   public synchronized static ChineseSegmenter getGBSegmenter() {
      Locale.setDefault(Locale.SIMPLIFIED_CHINESE);
      if (gbSegmenter == null) {
         gbSegmenter = new ChineseSegmenter(SIMP, true);
      }
      return gbSegmenter;
   }

   /**
    * Returns the cached traditional-Chinese segmenter, creating it on first
    * use. As a side effect the JVM default locale is set to
    * {@link Locale#TRADITIONAL_CHINESE} on every call.
    *
    * <p>The instance is cached separately from the simplified one, so the
    * result does not depend on which factory method was called first.
    *
    * @return the traditional-Chinese segmenter
    */
   public synchronized static ChineseSegmenter getBig5Segmenter() {
      Locale.setDefault(Locale.TRADITIONAL_CHINESE);
      if (big5Segmenter == null) {
         big5Segmenter = new ChineseSegmenter(TRAD, true);
      }
      return big5Segmenter;
   }

   /**
    * Opens a class-path resource, resolved relative to this class, as a
    * UTF-8 reader.
    *
    * @param resource resource name
    * @return a reader the caller must close
    * @throws IOException if the resource does not exist or cannot be opened
    */
   private BufferedReader openResource(String resource) throws IOException {
      InputStream data = getClass().getResourceAsStream(resource);
      if (data == null) {
         // getResourceAsStream signals "not found" with null, not an exception.
         throw new IOException("resource not found: " + resource);
      }
      return new BufferedReader(new InputStreamReader(data, ENCODING));
   }

   /**
    * Loads the word lexicon for the given character form into
    * {@code zhwords}.
    *
    * <p>Only lines without {@code '#'} and with one to four characters are
    * used. Each such line is stored as a whole word; for three- and
    * four-character words every leading part of at least two characters is
    * additionally stored as a prefix unless that key is already present.
    * Failures are reported on {@code System.err} and leave the lexicon with
    * the entries read so far.
    *
    * @param charform {@link #TRAD}, {@link #SIMP} or {@link #BOTH}
    */
   private void loadLexicon(int charform) {
      String resource;
      if (charform == SIMP) {
         resource = "simplexu8.txt";
      } else if (charform == TRAD) {
         resource = "tradlexu8.txt";
      } else if (charform == BOTH) {
         resource = "bothlexu8.txt";
      } else {
         System.err.println("No lexicon for character form " + charform);
         return;
      }

      try {
         BufferedReader in = openResource(resource);
         try {
            String newword;
            while ((newword = in.readLine()) != null) {
               int length = newword.length();
               if (length == 0 || length >= 5 || newword.indexOf('#') != -1) {
                  continue;
               }
               zhwords.put(newword, WHOLE_WORD);
               // Never overwrite an existing entry: a key already known as a
               // whole word must stay a whole word.
               for (int end = 2; end < length; end++) {
                  String prefix = newword.substring(0, end);
                  if (!zhwords.containsKey(prefix)) {
                     zhwords.put(prefix, WORD_PREFIX);
                  }
               }
            }
         } finally {
            in.close();
         }
      } catch (IOException e) {
         System.err.println("Exception loading lexicon " + resource + " " + e);
      }
   }

   /**
    * Adds every usable line of a data file to a set. Empty lines and lines
    * containing {@code '#'} are skipped. Failures are reported on
    * {@code System.err} and leave the set with the entries read so far.
    *
    * @param targetset  set that receives the lines
    * @param sourcefile resource name, resolved relative to this class
    */
   private void loadset(TreeSet targetset, String sourcefile) {
      try {
         BufferedReader in = openResource(sourcefile);
         try {
            String dataline;
            while ((dataline = in.readLine()) != null) {
               if (dataline.length() == 0 || dataline.indexOf('#') > -1) {
                  continue;
               }
               targetset.add(dataline);
            }
         } finally {
            in.close();
         }
      } catch (IOException e) {
         System.err.println("Exception loading data file " + sourcefile + " "
               + e);
      }
   }

   /**
    * Tells whether every character of the word is a known number character.
    *
    * @param testword word to test; must not be {@code null}
    * @return {@code true} if all characters are in the number set (also for
    *         the empty string), {@code false} otherwise
    */
   public boolean isNumber(String testword) {
      for (int i = 0; i < testword.length(); i++) {
         if (!cnumbers.contains(String.valueOf(testword.charAt(i)))) {
            return false;
         }
      }
      return true;
   }

   /**
    * Tells whether every character of the word is a known foreign-name
    * transliteration character.
    *
    * @param testword word to test; must not be {@code null}
    * @return {@code true} if all characters are in the foreign set (also for
    *         the empty string), {@code false} otherwise
    */
   public boolean isAllForeign(String testword) {
      for (int i = 0; i < testword.length(); i++) {
         if (!cforeign.contains(String.valueOf(testword.charAt(i)))) {
            return false;
         }
      }
      return true;
   }

   /**
    * Tells whether the word contains no character from the CJK Unified
    * Ideographs block.
    *
    * @param testword word to test; must not be {@code null}
    * @return {@code false} as soon as one such ideograph is found,
    *         {@code true} otherwise (also for the empty string)
    */
   public boolean isNotCJK(String testword) {
      for (int i = 0; i < testword.length(); i++) {
         if (isCjk(testword.charAt(i))) {
            return false;
         }
      }
      return true;
   }

   /** Tells whether a character lies in the CJK Unified Ideographs block. */
   private static boolean isCjk(char c) {
      return Character.UnicodeBlock.of(c) == Character.UnicodeBlock.CJK_UNIFIED_IDEOGRAPHS;
   }

   /**
    * Decides whether the character at {@code index} continues the word
    * collected so far.
    *
    * @param current word collected so far (not empty)
    * @param cline   the line being segmented
    * @param index   position in {@code cline} of the candidate character
    * @return {@code true} if the character should be appended to the word
    */
   private boolean continuesWord(String current, String cline, int index) {
      int clength = cline.length();
      char next = cline.charAt(index);
      String nextkey = String.valueOf(next);
      String candidate = current + next;
      Object kind = zhwords.get(candidate);

      // The extended word is itself in the lexicon.
      if (WHOLE_WORD.equals(kind)) {
         return true;
      }
      // Possibly a transliterated foreign name: keep collecting unless this
      // character and the following one form a lexicon entry.
      if (isAllForeign(current) && cforeign.contains(nextkey)
            && index + 2 < clength
            && !zhwords.containsKey(cline.substring(index, index + 2))) {
         return true;
      }
      // Keep consecutive number characters together.
      if (isNumber(current) && cnumbers.contains(nextkey)) {
         return true;
      }
      // The extended word is only a prefix: accept it when adding one more
      // character yields a lexicon entry.
      return WORD_PREFIX.equals(kind) && index + 1 < clength
            && zhwords.containsKey(candidate + cline.charAt(index + 1));
   }

   /**
    * Segments one line of text into words.
    *
    * <p>The separator is inserted between adjacent words and between a word
    * and neighbouring non-Chinese text, except where a whitespace character
    * already separates them. Non-Chinese characters are copied unchanged.
    *
    * @param cline     line to segment; must not be {@code null}
    * @param separator text inserted at word boundaries
    * @return the line with separators inserted
    */
   public String segmentLine(String cline, String separator) {
      StringBuffer currentword = new StringBuffer();
      StringBuffer outline = new StringBuffer();
      int clength = cline.length();

      for (int i = 0; i < clength; i++) {
         char currentchar = cline.charAt(i);
         boolean chinese = isCjk(currentchar)
               || cnumbers.contains(String.valueOf(currentchar));

         if (!chinese) {
            // Leaving Chinese text: flush the pending word, then copy the
            // character through.
            if (currentword.length() > 0) {
               outline.append(currentword.toString());
               if (!Character.isWhitespace(currentchar)) {
                  outline.append(separator);
               }
               currentword.setLength(0);
            }
            outline.append(currentchar);
         } else if (currentword.length() == 0) {
            // First character of a new word; separate it from preceding
            // non-whitespace text.
            if (i > 0 && !Character.isWhitespace(cline.charAt(i - 1))) {
               outline.append(separator);
            }
            currentword.append(currentchar);
         } else if (continuesWord(currentword.toString(), cline, i)) {
            currentword.append(currentchar);
         } else {
            // The word ends here; emit it and start anew with this character.
            outline.append(currentword.toString());
            if (!Character.isWhitespace(currentchar)) {
               outline.append(separator);
            }
            currentword.setLength(0);
            currentword.append(currentchar);
         }
      }

      // Emit the word still being collected when the line ended.
      outline.append(currentword.toString());
      return outline.toString();
   }

   /**
    * Small demonstration: segments a fixed sample line with the
    * simplified-Chinese segmenter and prints the result.
    *
    * @param args ignored
    * @throws Exception declared for compatibility; nothing is thrown
    *                   deliberately
    */
   public static void main(String[] args) throws Exception {
      ChineseSegmenter seg = ChineseSegmenter.getGBSegmenter();
      System.out.println(seg.segmentLine("Some string in chinese.", " "));
   }

}

本文参与 腾讯云自媒体同步曝光计划,分享自作者个人站点/博客。 原始发表:2006-03-24,如有侵权请联系 cloudcommunity@tencent 删除。 标签:public、string、txt、中文分词、append

本文标签: 中文分词代码(此代码为作者多年经验总结,以前发表过VB,PB版本)