// Home > Code Library > SegList (word-segmentation helper class)
// SegList (word-segmentation helper class)
using System;
using System.Collections;
using System.IO;
using System.Text.RegularExpressions;
/// <summary>
/// Helper list for the word segmenter: an ordered collection of entries that also
/// tracks the length of the longest entry added through <see cref="Add"/>.
/// </summary>
public class SegList
{
    /// <summary>
    /// Length of the longest string form among the entries passed to <see cref="Add"/>.
    /// It is not recalculated by <see cref="SetElem"/>.
    /// </summary>
    public int MaxLength;

    private ArrayList m_seg;

    /// <summary>
    /// Number of entries currently stored.
    /// </summary>
    public int Count
    {
        get
        {
            return m_seg.Count;
        }
    }

    /// <summary>
    /// Creates an empty list.
    /// </summary>
    public SegList()
    {
        m_seg = new ArrayList();
        MaxLength = 0;
    }

    /// <summary>
    /// Appends an entry and updates <see cref="MaxLength"/>.
    /// </summary>
    /// <param name="obj">Entry to append; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="obj"/> is null.</exception>
    public void Add(object obj)
    {
        // Validate before mutating so a failed call never leaves a half-applied entry behind.
        if (obj == null)
        {
            throw new ArgumentNullException("obj");
        }

        string text = obj.ToString();
        int length = (text == null) ? 0 : text.Length;

        m_seg.Add(obj);
        if (MaxLength < length)
        {
            MaxLength = length;
        }
    }

    /// <summary>
    /// Returns the entry at position <paramref name="i"/>, or null when the index is out of range.
    /// </summary>
    /// <param name="i">Zero-based index.</param>
    public object GetElem(int i)
    {
        // Negative indexes are treated like any other out-of-range index.
        if (i >= 0 && i < this.Count)
        {
            return m_seg[i];
        }

        return null;
    }

    /// <summary>
    /// Replaces the entry at position <paramref name="i"/>.
    /// </summary>
    /// <param name="i">Zero-based index of an existing entry.</param>
    /// <param name="obj">New value.</param>
    public void SetElem(int i, object obj)
    {
        m_seg[i] = obj;
    }

    /// <summary>
    /// Reports whether the list contains <paramref name="obj"/>.
    /// </summary>
    public bool Contains(object obj)
    {
        return m_seg.Contains(obj);
    }

    /// <summary>
    /// Sorts this list by entry length, longest first.
    /// </summary>
    public void Sort()
    {
        Sort(this);
    }

    /// <summary>
    /// Sorts <paramref name="list"/> in place by entry length, longest first.
    /// The placeholder entry "null" (and a null entry) counts as length 0.
    /// </summary>
    /// <param name="list">List to sort; must not be null.</param>
    /// <exception cref="ArgumentNullException"><paramref name="list"/> is null.</exception>
    public void Sort(SegList list)
    {
        if (list == null)
        {
            throw new ArgumentNullException("list");
        }

        // Selection sort, kept so that the relative order of equal-length entries
        // matches what earlier versions produced.
        for (int i = 0; i < list.Count - 1; ++i)
        {
            int max = i;
            for (int j = i + 1; j < list.Count; ++j)
            {
                if (SortLength(list.GetElem(j)) > SortLength(list.GetElem(max)))
                {
                    max = j;
                }
            }

            object o = list.GetElem(max);
            list.SetElem(max, list.GetElem(i));
            list.SetElem(i, o);
        }
    }

    /// <summary>
    /// Length used as the sort key: 0 for null or the "null" placeholder, otherwise the string length.
    /// </summary>
    private static int SortLength(object item)
    {
        if (item == null)
        {
            return 0;
        }

        string text = item.ToString();
        if (text == null || text == "null")
        {
            return 0;
        }

        return text.Length;
    }
}
/// <summary>
/// 分词类
/// </summary>
//----------------调用----------------------
//Segment seg = new Segment();
//seg.InitWordDics();
//seg.EnablePrefix = true;
//seg.Separator =" ";
//seg.SegmentText("字符串", false).Trim();
//-------------------------------------------
public class Segment
{
#region 私有字段
// Private fields.
// Default dictionary file locations, each resolved with Server.MapPath on the current HttpContext.
// NOTE(review): these initializers dereference System.Web.HttpContext.Current, so constructing a
// Segment presumably requires an active ASP.NET request — confirm.
private string m_DicPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sDict.dic");
private string m_NoisePath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNoise.dic");
private string m_NumberPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sNumber.dic");
private string m_WordPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sWord.dic");
private string m_PrefixPath = System.Web.HttpContext.Current.Server.MapPath("bin/ShootSeg/sPrefix.dic");
// Main dictionary; assigned by InitWordDics.
private Hashtable htWords;
// Word lists loaded by InitWordDics from NoisePath, NumberPath, WordPath and PrefixPath respectively.
private ArrayList alNoise;
private ArrayList alNumber;
private ArrayList alWord;
private ArrayList alPrefix;
// Duration in milliseconds of the most recent timed operation; exposed through EventTime.
private double m_EventTime = 0;
/// <summary>
/// Separator inserted between segmented words.
/// </summary>
private string m_Separator = " ";
/// <summary>
/// Regular expression used to test whether a character is a Chinese character.
/// </summary>
private string strChinese = "[\u4e00-\u9fa5]";
#endregion
#region 公有属性
/// <summary>
/// Gets or sets the path of the main dictionary file.
/// </summary>
public string DicPath
{
    get { return this.m_DicPath; }
    set { this.m_DicPath = value; }
}
/// <summary>
/// Stores a value in the ASP.NET application state under the given key.
/// </summary>
/// <param name="key">Cache key.</param>
/// <param name="val">Value to cache; null is replaced by the string " ".</param>
private static void SetCache(string key, object val)
{
    if (val == null) val = " ";
    System.Web.HttpContext.Current.Application.Lock();
    try
    {
        System.Web.HttpContext.Current.Application.Set(key, val);
    }
    finally
    {
        // Always release the application lock, even if Set throws.
        System.Web.HttpContext.Current.Application.UnLock();
    }
}
/// <summary>
/// Reads a value from the ASP.NET application state.
/// </summary>
/// <param name="key">Cache key.</param>
/// <returns>Whatever the application state returns for <paramref name="key"/>.</returns>
private static object GetCache(string key)
{
    object cached = System.Web.HttpContext.Current.Application.Get(key);
    return cached;
}
/// <summary>
/// Gets or sets the path of the noise-word file (marked by the original author as currently unused).
/// </summary>
public string NoisePath
{
    get { return this.m_NoisePath; }
    set { this.m_NoisePath = value; }
}
/// <summary>
/// Gets or sets the path of the number dictionary file.
/// </summary>
public string NumberPath
{
    get { return this.m_NumberPath; }
    set { this.m_NumberPath = value; }
}
/// <summary>
/// Gets or sets the path of the letter dictionary file.
/// </summary>
public string WordPath
{
    get { return this.m_WordPath; }
    set { this.m_WordPath = value; }
}
/// <summary>
/// Gets or sets the path of the name-prefix dictionary, used to correct the segmentation of personal names.
/// </summary>
public string PrefixPath
{
    get { return this.m_PrefixPath; }
    set { this.m_PrefixPath = value; }
}
/// <summary>
/// Gets or sets whether name correction is enabled.
/// Reading returns true when the prefix list is loaded and non-empty.
/// Setting true (re)loads the prefix list from <see cref="PrefixPath"/>; setting false empties it.
/// </summary>
public bool EnablePrefix
{
    get
    {
        // The list is null until InitWordDics or this setter has run; report that as disabled
        // instead of throwing.
        return alPrefix != null && alPrefix.Count > 0;
    }
    set
    {
        if (value)
        {
            alPrefix = LoadWords(PrefixPath, alPrefix);
        }
        else
        {
            alPrefix = new ArrayList();
        }
    }
}
/// <summary>
/// Gets the time, in milliseconds, taken by the most recent load or segmentation operation.
/// For short input strings a segmentation may report 0.
/// </summary>
public double EventTime
{
    get { return this.m_EventTime; }
}
/// <summary>
/// Gets or sets the separator placed between words; defaults to a single space.
/// Null or empty values are ignored by the setter.
/// </summary>
public string Separator
{
    get { return this.m_Separator; }
    set
    {
        if (string.IsNullOrEmpty(value))
        {
            return;
        }

        this.m_Separator = value;
    }
}
#endregion
#region 构造方法
/// <summary>
/// Creates a segmenter that uses the default dictionary paths.
/// No dictionary is loaded until <see cref="InitWordDics"/> is called.
/// </summary>
public Segment()
{
}
/// <summary>
/// Creates a segmenter with explicit dictionary paths and loads the dictionaries immediately.
/// </summary>
/// <param name="p_DicPath">Path of the main dictionary file.</param>
/// <param name="p_NoisePath">Path of the noise-word file.</param>
/// <param name="p_NumberPath">Path of the number dictionary file.</param>
/// <param name="p_WordPath">Path of the letter dictionary file.</param>
public Segment(string p_DicPath, string p_NoisePath, string p_NumberPath, string p_WordPath)
{
    // Each argument goes to its own field; previously all four were written to m_WordPath,
    // so only the last one had any effect.
    m_DicPath = p_DicPath;
    m_NoisePath = p_NoisePath;
    m_NumberPath = p_NumberPath;
    m_WordPath = p_WordPath;
    this.InitWordDics();
}
#endregion
#region 公有方法
/// <summary>
/// Loads the word lists used for segmentation.
/// </summary>
/// <remarks>
/// The main dictionary is read from <see cref="DicPath"/> only when no entry is cached under
/// "jcms_dict"; otherwise the cached table is reused. Each dictionary line is indexed by its first
/// character, then by its second character, and the rest of the line is stored in a
/// <see cref="SegList"/> (the placeholder "null" marks a two-character word). Reading stops at the
/// first blank line. The noise, number, letter and prefix lists are reloaded on every call, and
/// the elapsed time is stored in <see cref="EventTime"/>.
/// </remarks>
public void InitWordDics()
{
    DateTime start = DateTime.Now;
    if (GetCache("jcms_dict") == null)
    {
        Hashtable words = new Hashtable();
        // 'using' guarantees the file handle is released even if parsing throws.
        using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
        {
            string strline = reader.ReadLine();
            while (strline != null && strline.Trim() != "")
            {
                // A two-level index needs at least two characters; shorter lines are skipped
                // rather than failing in Substring(1, 1).
                if (strline.Length >= 2)
                {
                    string strChar1 = strline.Substring(0, 1);
                    string strChar2 = strline.Substring(1, 1);

                    Hashtable father = (Hashtable)words[strChar1];
                    if (father == null)
                    {
                        father = new Hashtable();
                        words.Add(strChar1, father);
                    }

                    SegList list = (SegList)father[strChar2];
                    if (list == null)
                    {
                        list = new SegList();
                        father.Add(strChar2, list);
                    }

                    if (strline.Length > 2)
                    {
                        list.Add(strline.Substring(2));
                    }
                    else
                    {
                        list.Add("null");
                    }
                }
                strline = reader.ReadLine();
            }
        }
        SetCache("jcms_dict", words);
    }
    htWords = (Hashtable)GetCache("jcms_dict");
    alNoise = LoadWords(NoisePath, alNoise);
    alNumber = LoadWords(NumberPath, alNumber);
    alWord = LoadWords(WordPath, alWord);
    alPrefix = LoadWords(PrefixPath, alPrefix);
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
/// <summary>
/// Reads a UTF-8 text file into a new ArrayList, one entry per line.
/// </summary>
/// <param name="strPath">Path of the file to read.</param>
/// <param name="list">Ignored; kept for compatibility. A new list is always created.</param>
/// <returns>A new ArrayList holding every line of the file in file order.</returns>
public ArrayList LoadWords(string strPath, ArrayList list)
{
    list = new ArrayList();
    // 'using' releases the file handle even if reading throws.
    using (StreamReader reader = new StreamReader(strPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null)
        {
            list.Add(strline);
            strline = reader.ReadLine();
        }
    }
    return list;
}
/// <summary>
/// Writes every word of the loaded main dictionary to the console, one per line.
/// Each line is the first-level key, the second-level key and the stored suffix concatenated.
/// </summary>
public void OutWords()
{
    foreach (DictionaryEntry firstLevel in htWords)
    {
        Hashtable secondTable = (Hashtable)firstLevel.Value;
        foreach (DictionaryEntry secondLevel in secondTable)
        {
            SegList suffixes = (SegList)secondLevel.Value;
            int index = 0;
            while (index < suffixes.Count)
            {
                Console.WriteLine(firstLevel.Key.ToString() + secondLevel.Key.ToString() + suffixes.GetElem(index).ToString());
                index++;
            }
        }
    }
}
/// <summary>
/// Writes each element of an ArrayList to the console, one per line.
/// A null list is ignored.
/// </summary>
/// <param name="list">List to print; may be null.</param>
public void OutArrayList(ArrayList list)
{
    if (list != null)
    {
        foreach (object item in list)
        {
            Console.WriteLine(item.ToString());
        }
    }
}
/// <summary>
/// Segments one piece of text into words, inserting <see cref="Separator"/> between them.
/// Line breaks are not handled by this overload.
/// </summary>
/// <remarks>
/// A "$" sentinel is appended to the input before scanning. The text is then walked one character
/// at a time: each character is classified with GetCharType and, for dictionary characters, the
/// following characters are matched against the two-level dictionary in htWords.
/// The elapsed time is stored in m_EventTime.
/// NOTE(review): both early returns hand back the trimmed text with the "$" sentinel still attached.
/// NOTE(review): the final clean-up removes the literal " $" (space + sentinel) regardless of the
/// configured Separator.
/// </remarks>
/// <param name="strText">Text to segment.</param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText)
{
// Append the "$" end sentinel, then trim surrounding whitespace.
strText = (strText + "$").Trim();
// Nothing to do when the dictionary is not loaded or the text is too short.
if (htWords == null) return strText;
if (strText.Length < 3) return strText;
DateTime start = DateTime.Now;
// Index of the last position handled by the main loop.
int length = 0;
// Number of steps accumulated so far for a pending name candidate (0 = none pending).
int preFix = 0;
// Whether the most recently classified character was a letter / a number.
bool word = false;
bool number = false;
// Output built so far.
string reText = "";
// Characters collected for the pending name candidate.
string strPrefix = "";
// Last character currently in reText.
string strLastChar = "";
// Text appended after a character that is emitted on its own (the separator or an empty string).
string strLastWords = Separator;
for (int i = 0; i < strText.Length - 1; i++)
{
#region 对于每一个字的处理过程
// Processing of a single character.
string strChar1 = strText.Substring(i, 1);
string strChar2 = strText.Substring(i + 1, 1).Trim();
// "yes" becomes true once the current character has been handled.
bool yes;
SegList l;
Hashtable h;
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
// A space in the input ends a running number/letter sequence and is otherwise skipped.
if (strChar1 == " ")
{
if ((number || word) && strLastChar != Separator) reText += this.Separator;
yes = true;
}
else
yes = false;
// GetCharType: 0 unknown, 1 number, 2 letter, plus 3 when the character is a first-level dictionary key.
int CharType = GetCharType(strChar1);
switch (CharType)
{
case 1:
#region 如果是数字,如果数字的上一位是字母要和后面的数字分开
// Number: if the previous character was a letter, separate it from the number that follows.
if (word)
{
reText += Separator;
}
word = false;
number = true;
strLastWords = "";
break;
#endregion
case 2:
case 5:
#region 如果是字母
// Letter.
if (number)
strLastWords = Separator;
else
strLastWords = "";
word = true;
number = false;
break;
#endregion
case 3:
case 4:
#region 第一级哈希表是否包含关键字,假如包含处理第二级哈希表
// The first-level table contains this character; continue with the second-level table.
// Was the previous character a letter?
if (word) reText += Separator;
#region 检测上一个是否是数字,这个过程是用于修正数字后的量词的
// If the previous character was a number, handle a measure word following the number.
// The candidates are looked up under the first-level key "n".
if (number && CharType != 4)
{
h = (Hashtable)htWords["n"];
if (h.ContainsKey(strChar1))
{
l = (SegList)h[strChar1];
if (l.Contains(strChar2))
{
reText += strChar1 + strChar2 + Separator;
yes = true;
i++;
}
else if (l.Contains("null"))
{
reText += strChar1 + Separator;
yes = true;
}
}
else
reText += Separator;
}
#endregion
// Chinese character that is not a Chinese numeral.
if (CharType == 3)
{
word = false;
number = false;
strLastWords = Separator;
}
else
{
word = false;
number = true;
strLastWords = "";
}
// Fetch the second-level table.
h = (Hashtable)htWords[strChar1];
// Does the second-level table contain the next character?
if (h.ContainsKey(strChar2))
{
#region 第二级包含关键字
// The second-level table contains the next character.
// Fetch the list of suffixes stored for this two-character start.
l = (SegList)h[strChar2];
// Try each stored suffix in order to see whether it completes a word.
for (int j = 0; j < l.Count; j++)
{
bool have = false;
string strChar3 = l.GetElem(j).ToString();
// Check each candidate suffix against the text, guarding against running past the end.
if ((strChar3.Length + i + 2) < strText.Length)
{
// Take as many characters after position i+2 as the suffix is long.
string strChar = strText.Substring(i + 2, strChar3.Length).Trim();
if (strChar3 == strChar && !yes)
{
// Flush any pending name candidate before the matched word.
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = 0;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + 1;
have = true;
yes = true;
break;
}
}
else if ((strChar3.Length + i + 2) == strText.Length)
{
// The suffix would reach exactly to the end of the text.
string strChar = strText.Substring(i + 2).Trim();
if (strChar3 == strChar && !yes)
{
if (strPrefix != "")
{
reText += strPrefix + Separator;
strPrefix = "";
preFix = 0;
}
reText += strChar1 + strChar2 + strChar;
i += strChar3.Length + 1;
have = true;
yes = true;
break;
}
}
// Last candidate reached without a match: fall back to the two-character word if the list has the "null" placeholder.
if (!have && j == l.Count - 1 && l.Contains("null") && !yes)
{
if (preFix == 1)
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else if (preFix > 1)
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else
{
// NOTE(review): both branches of this if/else append the same text.
if (CharType == 4) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
break;
}
else if (have)
{
break;
}
}
#endregion
// If nothing matched there is one more possibility: the word has only two characters and no longer word starts with them.
if (!yes && l.Contains("null"))
{
if (preFix == 1)
{
reText += strPrefix + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else if (preFix > 1)
{
reText += strPrefix + strLastWords + strChar1 + strChar2;
strPrefix = "";
preFix = 0;
}
else
{
// NOTE(review): both branches of this if/else append the same text.
if (CharType == 4) reText += strChar1 + strChar2;
else reText += strChar1 + strChar2;
strLastWords = this.Separator;
number = false;
}
i++;
yes = true;
}
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
// Keep a run of type-4 characters together; otherwise make sure the output ends with a separator.
if (CharType == 4 && GetCharType(strLastChar) == 4)
{
number = true;
}
else if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
break;
default:
#region 未知字符,可能是生僻字,也可能是标点符合之类
// Unknown character: possibly a rare character, punctuation or similar.
if (word && !yes)
{
reText += Separator;
}
else if (number && !yes)
{
reText += Separator;
}
number = false;
word = false;
strLastWords = this.Separator;
break;
#endregion
}
// Characters that are part of a number or letter run are copied through unchanged.
if (!yes && number || !yes && word)
{
reText += strChar1;
yes = true;
}
if (!yes)
{
#region 处理姓名问题
// Name handling.
if (preFix == 0)
{
// No candidate pending: check whether this position starts with a known prefix (two characters first, then one).
if (alPrefix.Contains(strChar1 + strChar2))
{
i++;
strPrefix = strChar1 + strChar2;
preFix++;
}
else if (alPrefix.Contains(strChar1))
{
if (!number)
{
strPrefix = strChar1;
preFix++;
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
else
{
// preFix is 0 on this path, so only the final else below can execute.
if (preFix == 3)
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
else if (preFix > 0)
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
word = false;
}
}
}
else
{
// A candidate is pending: flush it once preFix reaches 3, otherwise extend it with Chinese
// characters and flush it on anything else.
if (preFix == 3)
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
else if (preFix > 0)
{
if (Regex.IsMatch(strChar1, strChinese))
{
strPrefix += strChar1;
preFix++;
}
else
{
reText += strPrefix + Separator + strChar1 + Separator;
strPrefix = "";
preFix = 0;
}
}
else
{
reText += strChar1 + strLastWords;
number = false;
}
}
#endregion
}
length = i;
#endregion
}
#region 最后防止最后一个字的丢失
// Finally, make sure the last character of the text is not lost.
if (length < strText.Length - 1)
{
string strLastChar1 = strText.Substring(strText.Length - 1).Trim();
string strLastChar2 = strText.Substring(strText.Length - 2).Trim();
if (reText.Length > 0) strLastChar = reText.Substring(reText.Length - 1);
if (preFix != 0)
{
// Flush a still-pending name candidate together with the last character.
reText += strPrefix + strLastChar1;
}
else
{
switch (GetCharType(strLastChar1))
{
case 1:
// NOTE(review): both operands compare against the same literal; presumably one of them was
// originally a different (e.g. full-width) character — confirm against the original source.
if (strLastChar1 != "." && strLastChar1 != ".")
reText += strLastChar1;
else
reText += Separator + strLastChar1;
break;
case 2:
case 5:
if (alWord.Contains(strLastChar2))
reText += strLastChar1;
break;
case 3:
case 4:
if ((number || word) && strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
default:
if (strLastChar != Separator)
reText += Separator + strLastChar1;
else
reText += strLastChar1;
break;
}
}
// Make sure the output ends with a separator.
if (reText.Length > 0) strLastChar = (reText.Substring(reText.Length - 1));
if (strLastChar != this.Separator) reText += this.Separator;
}
#endregion
TimeSpan duration = DateTime.Now - start;
m_EventTime = duration.TotalMilliseconds;
// Strip the " $" sentinel (space followed by "$") from the result.
return reText.Replace(" $", "");
}
/// <summary>
/// Segments text, optionally handling multi-line input.
/// </summary>
/// <param name="strText">Text to segment; null is treated as empty text.</param>
/// <param name="Enter">
/// When true, the text is split on '\n', each line is segmented separately and the results are
/// joined with "\r\n" after every line. When false, the call is forwarded to the single-line overload.
/// </param>
/// <returns>The segmented text.</returns>
public string SegmentText(string strText, bool Enter)
{
    if (!Enter)
    {
        return SegmentText(strText);
    }

    DateTime start = DateTime.Now;
    // The single-line overload tolerates null; do the same here instead of failing in Split.
    if (strText == null)
    {
        strText = "";
    }

    // The character literal must use ASCII apostrophes; typographic quotes do not compile.
    string[] strArr = strText.Split('\n');
    System.Text.StringBuilder reText = new System.Text.StringBuilder();
    for (int i = 0; i < strArr.Length; i++)
    {
        reText.Append(SegmentText(strArr[i]));
        reText.Append("\r\n");
    }

    // Overwrites the per-line timing so EventTime covers the whole multi-line call.
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return reText.ToString();
}
#region 判断字符类型
/// <summary>
/// Classifies a single character.
/// The base value is 0 (unknown), 1 (in the number list) or 2 (in the letter list; wins over 1);
/// 3 is added when the character is a first-level key of the main dictionary, so the documented
/// results are 3 for a Chinese character and 4 for a Chinese numeral, and 5 results for a letter
/// that is also a dictionary key.
/// </summary>
/// <param name="p_Char">The character to classify, as a string.</param>
/// <returns>The character type code.</returns>
private int GetCharType(string p_Char)
{
    bool inNumberList = alNumber.Contains(p_Char);
    bool inLetterList = alWord.Contains(p_Char);
    bool isDictionaryKey = htWords.ContainsKey(p_Char);

    int baseType;
    if (inLetterList)
    {
        baseType = 2;
    }
    else if (inNumberList)
    {
        baseType = 1;
    }
    else
    {
        baseType = 0;
    }

    return isDictionaryKey ? baseType + 3 : baseType;
}
#endregion
#region 对加载的词典排序并重新写入
/// <summary>
/// Sorts the loaded dictionary and writes it back to the dictionary file, without reloading it.
/// </summary>
public void SortDic()
{
    bool reloadAfterwards = false;
    this.SortDic(reloadAfterwards);
}
/// <summary>
/// Sorts every suffix list of the loaded dictionary (longest first) and rewrites the dictionary
/// file at <see cref="DicPath"/> from the in-memory table.
/// </summary>
/// <param name="Reload">When true, <see cref="InitWordDics"/> is called after the file is written.</param>
/// <exception cref="InvalidOperationException">The dictionary has not been loaded.</exception>
public void SortDic(bool Reload)
{
    DateTime start = DateTime.Now;
    // Check before opening the writer: opening it truncates the file, so failing afterwards
    // would leave an empty dictionary on disk.
    if (htWords == null)
    {
        throw new InvalidOperationException("The dictionary is not loaded; call InitWordDics before SortDic.");
    }

    // 'using' flushes and closes the file even if writing throws.
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (DictionaryEntry firstLevel in htWords)
        {
            string firstChar = firstLevel.Key.ToString();
            foreach (DictionaryEntry secondLevel in (Hashtable)firstLevel.Value)
            {
                string head = firstChar + secondLevel.Key.ToString();
                SegList aa = (SegList)secondLevel.Value;
                aa.Sort();
                for (int i = 0; i < aa.Count; i++)
                {
                    string suffix = aa.GetElem(i).ToString();
                    // "null" is the placeholder for a two-character word and is not written out.
                    if (suffix == "null")
                    {
                        sw.WriteLine(head);
                    }
                    else
                    {
                        sw.WriteLine(head + suffix);
                    }
                }
            }
        }
    }

    if (Reload) InitWordDics();
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
}
#endregion
/// <summary>
/// Removes exact duplicate lines from the dictionary file at <see cref="DicPath"/>
/// (marked by the original author as currently unused).
/// </summary>
/// <remarks>
/// Only the leading part of the file, up to the first blank line, is de-duplicated; that is the
/// same stop condition <see cref="InitWordDics"/> uses when loading. The first occurrence of each
/// line is kept and the original order is preserved. Anything from the first blank line onwards is
/// written back unchanged. The elapsed time is stored in <see cref="EventTime"/>.
/// </remarks>
/// <returns>The number of duplicate lines that were removed.</returns>
public int Optimize()
{
    int l = 0;
    DateTime start = DateTime.Now;
    Hashtable htOptimize = new Hashtable();
    // Keeps the lines to write, in file order; a Hashtable alone would scramble the order.
    ArrayList output = new ArrayList();
    using (StreamReader reader = new StreamReader(DicPath, System.Text.Encoding.UTF8))
    {
        string strline = reader.ReadLine();
        while (strline != null && strline.Trim() != "")
        {
            if (!htOptimize.ContainsKey(strline))
            {
                htOptimize.Add(strline, null);
                output.Add(strline);
            }
            else
            {
                l++;
            }
            // Advance to the next line; without this the loop never terminates.
            strline = reader.ReadLine();
        }
        // Pass the rest of the file through untouched so rewriting never discards content.
        while (strline != null)
        {
            output.Add(strline);
            strline = reader.ReadLine();
        }
    }
    Console.WriteLine("ready");
    using (StreamWriter sw = new StreamWriter(DicPath, false, System.Text.Encoding.UTF8))
    {
        foreach (string line in output)
        {
            sw.WriteLine(line);
        }
    }
    TimeSpan duration = DateTime.Now - start;
    m_EventTime = duration.TotalMilliseconds;
    return l;
}
#endregion
}
// SegList (word-segmentation helper class)