    A Python text-processing recipe (jieba word segmentation and symbol removal)

    Let's look at the code:

    import re
    import jieba
    import codecs
    import pandas as pd

    def simplification_text(xianbingshi):
        """Extract the marked text spans from the source file."""
        xianbingshi_simplification = []
        with codecs.open(xianbingshi, 'r', 'utf8') as f:
            for line in f:
                line = line.strip()
                # The lookaround pattern was garbled in the original post; it
                # appears to capture the text between <b> and <e> markers.
                line_write = re.findall(r'(?<=<b>).*?(?=<e>)', line)
                for line in line_write:
                    xianbingshi_simplification.append(line)
        with codecs.open(r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\code\xianbingshi_write.txt', 'w', 'utf8') as f:
            for line in xianbingshi_simplification:
                f.write(line + '\n')

    def jieba_text():
        """Segment the extracted text with jieba and deduplicate the tokens."""
        word_list = []
        data = open(r"C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\xianbingshi_write.txt", encoding='utf-8').read()
        seg_list = jieba.cut(data, cut_all=False)  # precise mode
        for i in seg_list:
            word_list.append(i.strip())
        # Deduplicate while keeping the first occurrence of each token
        data_quchong = pd.DataFrame({'a': word_list})
        data_quchong.drop_duplicates(subset=['a'], keep='first', inplace=True)
        word_list = data_quchong['a'].tolist()
        with codecs.open('word.txt', 'w', 'utf8') as w:
            for line in word_list:
                w.write(line + '\n')

    def word_messy(word):
        """Refine the word list: drop purely numeric and alphanumeric tokens."""
        word_sub_list = []
        with codecs.open(word, 'r', 'utf8') as f:
            for line in f:
                line_sub = re.sub(r"^[1-9]\d*\.\d*|^[A-Za-z0-9]+$|^[0-9]*$|^(-?\d+)(\.\d+)?$|^[A-Za-z0-9]{4,40}.*?", '', line)
                word_sub_list.append(line_sub)
        word_sub_list.sort()
        with codecs.open('word.txt', 'w', 'utf8') as w:
            for line in word_sub_list:
                w.write(line.strip("\n") + '\n')

    if __name__ == '__main__':
        xianbingshi = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\yiwoqu\xianbingshi_sub_sen_all(1).txt'
        # word = r'C:\Users\Administrator.SC-201812211013\PycharmProjects\untitled29\word.txt'
        simplification_text(xianbingshi)
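
    As a point of reference, here is a minimal, self-contained sketch (the sample sentence is made up for illustration) showing the difference between jieba's precise mode (cut_all=False, used above) and full mode:

    import jieba

    sentence = "我来到北京清华大学"  # illustrative sample sentence

    # Precise mode (the default): splits into the most likely token sequence
    print("/".join(jieba.cut(sentence, cut_all=False)))

    # Full mode: emits every dictionary word it finds, including overlaps
    print("/".join(jieba.cut(sentence, cut_all=True)))

    Note that drop_duplicates(keep='first') in jieba_text() preserves first-occurrence order; on Python 3.7+, list(dict.fromkeys(word_list)) would give the same result without the pandas dependency.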

    Supplement: jieba word segmentation in Python, using re to strip symbols

    Here's the code:

    import re
    import jieba

    # Build the stopword dictionary
    stopwords = {}
    fstop = open('stop_words.txt', 'r', encoding='utf-8', errors='ignore')
    for eachWord in fstop:
        stopwords[eachWord.strip()] = eachWord.strip()  # stopword dictionary
    fstop.close()

    f1 = open('all.txt', 'r', encoding='utf-8', errors='ignore')
    f2 = open('allutf11.txt', 'w', encoding='utf-8')
    line = f1.readline()
    while line:
        line = line.strip()  # strip leading/trailing whitespace
        # Remove ASCII and full-width punctuation
        line = re.sub(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……*()]+", " ", line)
        seg_list = jieba.cut(line, cut_all=False)  # jieba segmentation, precise mode
        outStr = ""
        for word in seg_list:
            if word not in stopwords:
                outStr += word
                outStr += " "
        f2.write(outStr)
        line = f1.readline()
    f1.close()
    f2.close()
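
    For comparison, here is a compact sketch of the same stopword-filtering step, assuming the same stop_words.txt and all.txt inputs (the output name out.txt is a placeholder); it uses a set and context managers instead of a dict and manual close():

    import re
    import jieba

    PUNCT = re.compile(r"[0-9\s+\.\!\/_,$%^*()?;;:-【】+\"\']+|[+——!,;:。?、~@#¥%……*()]+")

    with open('stop_words.txt', encoding='utf-8', errors='ignore') as f:
        stopwords = {w.strip() for w in f}  # set membership tests are O(1)

    with open('all.txt', encoding='utf-8', errors='ignore') as src, \
         open('out.txt', 'w', encoding='utf-8') as dst:
        for line in src:
            cleaned = PUNCT.sub(" ", line.strip())  # strip punctuation, as above
            words = (w for w in jieba.cut(cleaned) if w not in stopwords)
            dst.write(" ".join(words) + "\n")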

    The above reflects my personal experience; I hope it serves as a useful reference, and I hope everyone will continue to support 脚本之家.

