首页 > 代码库 > html转换为纯文本,支持撇号
HTML 转换为纯文本(支持撇号)
/// <summary>
/// Converts an HTML document or fragment to plain text.
/// </summary>
/// <remarks>
/// The conversion is regex based and best-effort; it is not a full HTML parser.
/// Line-ending convention of the result: every &lt;br&gt; / &lt;li&gt; becomes a bare "\r",
/// every &lt;tr&gt; / &lt;p&gt; becomes "\r\r", and each run of doubled "\r" is finally
/// collapsed into a single "\r\n".
/// </remarks>
/// <param name="source">The HTML text to convert. May be null or empty.</param>
/// <returns>
/// The plain-text rendering of <paramref name="source"/>, or an empty string when
/// <paramref name="source"/> is null or empty.
/// </returns>
private static string HtmlToPlainText(string source)
{
    if (string.IsNullOrEmpty(source))
    {
        return string.Empty;
    }

    const RegexOptions options = RegexOptions.IgnoreCase;

    // Line breaks and tabs in the markup itself are flattened to spaces first.
    // This also lets '.' in the patterns below span what used to be several lines.
    string result = source.Replace("\r", " ").Replace("\n", " ").Replace("\t", " ");

    // Remove <head>, <script> and <style> elements together with their content.
    // The quantifier is lazy so that two separate <script> blocks do not swallow
    // the body text between them, and \b keeps <header> from being treated as <head>.
    result = Regex.Replace(result, @"<\s*head\b[^>]*>.*?<\s*/\s*head\s*>", string.Empty, options);
    result = Regex.Replace(result, @"<\s*script\b[^>]*>.*?<\s*/\s*script\s*>", string.Empty, options);
    result = Regex.Replace(result, @"<\s*style\b[^>]*>.*?<\s*/\s*style\s*>", string.Empty, options);

    // Separate table cells with a space.
    // NOTE(review): the original comment said "insert tabs" but the replacement
    // text was a space; the space is kept - confirm which one was intended.
    result = Regex.Replace(result, @"<\s*td\b[^>]*>", " ", options);

    // Line break for <br> and <li>, including <br/>, <br /> and tags with attributes.
    result = Regex.Replace(result, @"<\s*(br|li)\b[^>]*>", "\r", options);

    // Paragraph break for <tr> and <p>; \b prevents <pre>, <param>, <track> from matching.
    result = Regex.Replace(result, @"<\s*(tr|p)\b[^>]*>", "\r\r", options);

    // Remove every remaining tag.
    result = Regex.Replace(result, @"<[^>]*>", string.Empty);

    // Decode the supported character entities. This happens after tag removal so
    // that a decoded '<' or '>' is kept as text instead of being stripped as a tag.
    result = Regex.Replace(result, @"&nbsp;", " ", options);
    result = Regex.Replace(result, @"&lt;", "<", options);
    result = Regex.Replace(result, @"&gt;", ">", options);

    // Apostrophes (literal or as an entity) are emitted as U+2018, which is the
    // character the original replacement produced.
    // NOTE(review): confirm U+2018 rather than a plain ' is really wanted.
    result = Regex.Replace(result, @"&(#0*39|#x0*27|apos);|'", "‘", options);

    // Drop any other entity. The character class is restricted to entity characters
    // so ordinary text such as "Q&A ok;" is left alone, and &amp; is skipped here
    // because it is decoded last.
    result = Regex.Replace(result, @"&(?!amp;)[#a-z0-9]{2,6};", string.Empty, options);

    // &amp; is decoded last so that "&amp;lt;" yields the text "&lt;" and not "<".
    result = Regex.Replace(result, @"&amp;", "&", options);

    // Collapse runs of spaces, drop spaces sitting between two line breaks,
    // then turn each run of doubled "\r" into a single "\r\n".
    result = Regex.Replace(result, @" {2,}", " ");
    result = Regex.Replace(result, "\r +\r", "\r\r");
    result = Regex.Replace(result, @"(\r\r)+", "\r\n");

    return result;
}
html转换为纯文本,支持撇号
声明:以上内容来自用户投稿及互联网公开渠道收集整理发布,本网站不拥有所有权,未作人工编辑处理,也不承担相关法律责任,若内容有误或涉及侵权可进行投诉: 投诉/举报 工作人员会在5个工作日内联系你,一经查实,本站将立刻删除涉嫌侵权内容。