Python：一个相当强大的 URL 识别正则表达式

发布于 2019-10-14  27 次阅读


# -*- coding:utf-8 -*-
import re

# Liberal URL matcher, compiled once at import time instead of on every call.
# Capture group 1 spans the entire match, so group(1) is the full URL text.
# NOTE: for str input Python 3 treats \w and \b as Unicode-aware, so a URL
# glued directly to a preceding CJK character (no space or punctuation in
# between) has no word boundary in front of it and may be missed or only
# partially matched.
# NOTE(review): the pattern nests quantifiers; presumably it can backtrack
# heavily on adversarial input -- confirm before running on untrusted text.
_URL_PATTERN = re.compile(r'(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))')


def find_all_url(sentence, show_urls=None, delete_urls=None):
    """Find the URLs in *sentence*; optionally print them and/or strip them.

    Args:
        sentence: Text to scan.
        show_urls: When equal to 1 (``True`` also qualifies), print a summary
            line ``find <count> URLs`` followed by one detected URL per line.
        delete_urls: When equal to 1 (``True`` also qualifies), return a copy
            of *sentence* with every detected URL removed.

    Returns:
        The sentence with the detected URLs removed when ``delete_urls == 1``;
        otherwise the integer ``1`` (kept for backward compatibility).
    """
    urls = [match.group(1) for match in _URL_PATTERN.finditer(sentence)]

    if show_urls == 1:
        print("find", str(len(urls)), "URLs")
        for url in urls:
            print(url)

    if delete_urls == 1:
        # Remove exactly the matched spans. Replacing each URL's text with
        # str.replace() also hits the same text inside a longer URL found
        # later (e.g. "www.a.com" inside "http://www.a.com/x"), which leaves
        # fragments such as "http:///x" behind.
        return _URL_PATTERN.sub('', sentence)
    return 1

if __name__ == '__main__':
    # Demo: run the detector over a few mixed Chinese/ASCII samples and print
    # what it finds, with a dashed separator line between samples.
    demo_sentences = (
        '我的邮箱是Emal:http:// http://www.plsseer0qaq@gmail.com 识别可能会有些问题但是完全没啥',
        'http://  http://goo.gl/BmT8gZ 匹配了 https://goo.gl/MxRdMO',
        'my便http:当头 www.fjnu.edu.cn',
        'qwq?@!#哈www.fjnu.edu.cn',
    )

    for position, text in enumerate(demo_sentences):
        if position > 0:
            print('----------')
        # Plain call: scans the text, prints nothing; the result is discarded.
        find_all_url(text)
        # Pass delete_urls=1 instead and print the result to see the text
        # with the detected URLs stripped out.
        find_all_url(text, show_urls=1)




(?i)\b((?:[a-z][\w-]+:(?:/{1,3}|[a-z0-9%])|www\d{0,3}[.]|[a-z0-9.\-]+[.][a-z]{2,4}/)(?:[^\s()<>]+|\(([^\s()<>]+|(\([^\s()<>]+\)))*\))+(?:\(([^\s()<>]+|(\([^\s()<>]+\)))*\)|[^\s`!()\[\]{};:\'".,<>?«»“”‘’]))

喜欢这篇文章吗？不妨分享给朋友们吧！

科学是第一生产力