# Download a file
import requests
# Download a single image over HTTP and save it to a local file.
r = requests.get('http://i.pegpic.com/pic/028326/koqn0d5tgankoqn0d5tgan.jpg',
                 timeout=30)
# Fail loudly on a 4xx/5xx answer instead of saving an error page as a .jpg.
r.raise_for_status()
filename = 'D://a.jpg'
# Binary mode: the payload is raw image bytes. ``content`` is the public
# accessor for the body (``_content`` is a private attribute).
with open(filename, 'wb') as target:
    target.write(r.content)
# File lookup with glob
import glob

# List every .jpg image one directory level below E:/Picture.
print(glob.glob(r"E:/Picture/*/*.jpg"))
# List every .py file in the parent directory (relative path).
print(glob.glob(r'../*.py'))
# Processing the files of a directory tree
#coding: utf-8
import os;
import re;
# Entry point
def main(fpath):
    """Walk ``fpath`` recursively and process every file found.

    The full path of each file is printed and then handed to
    ``process_a_file``.

    Args:
        fpath: Root directory to scan.
    """
    for root, _dirs, files in os.walk(fpath):
        for filename in files:
            # os.path.join inserts the platform's separator instead of a
            # hard-coded backslash.
            full_file = os.path.join(root, filename)
            print(full_file)
            process_a_file(full_file)
# Process a single file
def process_a_file(afile):
    """Append the "toBI" sections of ``afile`` to ``out.txt``.

    A header line ``NEW DOC <path>`` is written first. The input is then
    scanned line by line: a line that starts with a double quote is a
    section marker and is never copied; it switches copying on when it
    starts with ``"toBI"`` and off otherwise. Every other line is copied
    (right-stripped, newline-terminated) while copying is on.

    Args:
        afile: Path of the file to read.
    """
    print(afile)
    # out.txt is opened in append mode so the output of successive calls
    # accumulates in one file. The with-block closes both files even when
    # reading or writing fails.
    with open(afile, 'r') as ofile, open("out.txt", 'a') as wfile:
        wfile.write("NEW DOC %s\n" % afile)
        do_write = False
        for eachline in ofile:
            aline = eachline.rstrip()
            # The pattern '"*"' matches one or more leading double quotes.
            if re.match('"*"', aline) is not None:
                do_write = re.match('"toBI"', aline) is not None
            elif do_write:
                wfile.write(aline)
                wfile.write("\n")
# Run the directory scan.
# NOTE(review): 'D:Temp' is a drive-relative path; confirm that this, and not
# an absolute path under the drive root, is what was intended.
main('D:Temp')
# File statistics
#coding: utf-8
import os;
import re;
import sys;
# Module-level counters. The initial entries only illustrate the layout of
# each table; main() clears every table before counting.
dic_word = {'test': 1}      # word -> occurrences
dic_phone = {'HH': 1}       # phone -> occurrences
dic_b_phone = {'HH HA': 1}  # pair of adjacent phones -> occurrences
dic_t_phone = {'HH HA HA': 1}  # triple of adjacent phones -> occurrences
# Sentence length in words -> number of sentences
# (e.g. 10 sentences that are 8 words long).
dic_sen_len = {8: 10}
# Prosodic phrase length in words -> number of phrases
# (e.g. 5 phrases that are 2 words long).
dic_rhythm_word_len = {2: 5}
# Prosodic phrase length in phones -> number of phrases
# (e.g. 5 phrases that are 2 phones long).
dic_rhythm_phone_len = {2: 5}
# Entry point
def main(fpath):
    """Collect statistics from the file ``fpath`` and write the log files.

    Every module-level counter is reset first. The file is then read line
    by line: odd-numbered lines (1st, 3rd, ...) are handled as sentence
    lines and even-numbered lines as phone lines. Finally all counters are
    written out by ``analysis_out``.

    Args:
        fpath: Path of the text file to analyse.
    """
    for counter in (dic_word, dic_phone, dic_b_phone, dic_t_phone,
                    dic_sen_len, dic_rhythm_word_len, dic_rhythm_phone_len):
        counter.clear()

    # The with-block closes the input even if a line fails to process.
    with open(fpath, 'r') as ofile:
        for order, aline in enumerate(ofile, 1):
            if order % 2 == 1:
                # Sentence line: the first 7 characters are skipped
                # (presumably a fixed-width prefix -- TODO confirm).
                process_word_line(aline[7:])
            else:
                # Phone line
                process_phone_line(aline)

    # Write the results
    analysis_out()
def process_word_line(aline):
    """Update the word-level counters from one sentence line.

    The line is lower-cased and its prosodic phrases are counted; the
    cleaned text is then split into words, which feed the sentence-length
    and word-frequency counters.
    """
    lowered = aline.lower()
    # Phrase lengths are counted on the text that still carries the '/'
    # and '%' markers.
    rhythm_word_calc(lowered)
    # Clean the line, split on single spaces and drop the empty strings
    # left by consecutive spaces.
    tokens = [tok for tok in clean_aline(lowered).split(' ') if tok != ""]
    # Sentence length
    dic_add(dic_sen_len, len(tokens))
    # Word frequencies
    for tok in tokens:
        dic_add(dic_word, tok)
def process_phone_line(aline):
    """Update the phone-level counters from one phone line.

    Prosodic phrases are counted on the raw line; the cleaned line is then
    split into phones, and single phones, adjacent pairs and adjacent
    triples are counted.
    """
    # Phrase lengths are counted on the text that still carries the '/'
    # and '.' markers.
    rhythm_phone_calc(aline)
    # Clean the line, split on single spaces and drop the empty strings
    # left by consecutive spaces.
    units = [unit for unit in clean_aline(aline).split(' ') if unit != ""]
    for unit in units:
        dic_add(dic_phone, unit)
    # Pairs of adjacent phones
    for pair in zip(units, units[1:]):
        dic_add(dic_b_phone, "%s-%s" % pair)
    # Triples of adjacent phones
    for triple in zip(units, units[1:], units[2:]):
        dic_add(dic_t_phone, "%s-%s-%s" % triple)
def rhythm_word_calc(aline):
    """Count word-level prosodic phrase lengths in one sentence line.

    A token containing '/' or '%' closes a phrase. The recorded length is
    the number of tokens since the previous closing token, the closing
    token itself included. Tokens after the last closing token are not
    counted.
    """
    cleaned = clean_aline_word_rhythm(aline)
    tokens = [tok for tok in cleaned.split(' ') if tok != ""]
    last_boundary = -1
    for index, token in enumerate(tokens):
        if '/' not in token and '%' not in token:
            continue
        dic_add(dic_rhythm_word_len, index - last_boundary)
        last_boundary = index
def rhythm_phone_calc(aline):
    """Count phone-level prosodic phrase lengths in one phone line.

    A token containing '/' or '.' closes a phrase. The recorded length is
    the number of tokens since the previous closing token, the closing
    token itself included.
    """
    cleaned = clean_aline_phone_rhythm(aline)
    units = [unit for unit in cleaned.split(' ') if unit != ""]
    last_boundary = -1
    for index, unit in enumerate(units):
        if '/' not in unit and '.' not in unit:
            continue
        dic_add(dic_rhythm_phone_len, index - last_boundary)
        last_boundary = index
# Output results
def analysis_out():
    """Write each statistics table to its own log file."""
    reports = (
        (dic_word, 'word.log'),
        (dic_phone, 'phone.log'),
        (dic_sen_len, 'sen_len.log'),
        (dic_b_phone, 'dic_bi_phone.log'),
        (dic_t_phone, 'dic_tri_phone.log'),
        (dic_rhythm_word_len, 'dic_rhythm_word_len.log'),
        (dic_rhythm_phone_len, 'dic_rhythm_phone_len.log'),
    )
    for table, log_name in reports:
        output_a_dic(table, log_name)
def output_a_dic(a_dic, filename):
    """Write a counter table to ``filename``, highest count first.

    One ``<key> <count>`` line is written per entry. Any existing file of
    that name is overwritten.

    Args:
        a_dic: Mapping from key to integer count.
        filename: Path of the file to write.
    """
    # items() exists on both Python 2 and 3 dictionaries (iteritems() is
    # Python 2 only). Entries are ordered by count, descending.
    ranked = sorted(a_dic.items(), key=lambda item: item[1], reverse=True)
    with open(filename, 'w') as wfile:
        for key, count in ranked:
            wfile.write('%s %d\n' % (key, count))
# Helper functions
def clean_aline(aline):
    """Strip prosody markers and punctuation from a line.

    "/" followed by whitespace becomes one space; ".", ",", "!" and '"'
    followed by whitespace are removed; every "%" becomes a space; leading
    and trailing whitespace is stripped.

    Args:
        aline: Raw text line.

    Returns:
        The cleaned line.
    """
    # The whitespace class has to be spelled \s (and the period escaped):
    # without the backslashes the patterns match a literal "s", and ".s"
    # deletes any character that precedes an "s".
    substitutions = (
        (r"/\s", " "),
        (r"\.\s", ""),
        (r",\s", ""),
        (r"!\s", ""),
        (r'"\s', ""),
    )
    for pattern, replacement in substitutions:
        aline = re.sub(pattern, replacement, aline)
    aline = aline.replace("%", " ")
    return aline.strip()
def clean_aline_word_rhythm(aline):
    """Remove punctuation from a sentence line, keeping the phrase markers.

    "." , "!" and '"' followed by whitespace are removed; "," followed by
    whitespace becomes one space; any remaining '"' is removed; leading and
    trailing whitespace is stripped. The "/" and "%" markers are left in
    place.

    Args:
        aline: Raw sentence line.

    Returns:
        The cleaned line.
    """
    # The whitespace class has to be spelled \s (and the period escaped):
    # without the backslashes the patterns match a literal "s", and ".s"
    # deletes any character that precedes an "s".
    substitutions = (
        (r"\.\s", ""),
        (r",\s", " "),
        (r"!\s", ""),
        (r'"\s', ""),
    )
    for pattern, replacement in substitutions:
        aline = re.sub(pattern, replacement, aline)
    aline = aline.replace('"', '')
    return aline.strip()
def clean_aline_phone_rhythm(aline):
    """Normalise a phone line so phrase markers stick to the previous token.

    "," followed by whitespace becomes one space; "!" and '"' followed by
    whitespace are removed; the space before each "/" and "." is dropped so
    the marker is attached to the token before it; leading and trailing
    whitespace is stripped; a final "." is appended so the last token
    always carries a marker.

    Args:
        aline: Raw phone line.

    Returns:
        The normalised line, always ending in ".".
    """
    # The whitespace class has to be spelled \s: without the backslash the
    # patterns match a literal "s".
    substitutions = (
        (r",\s", " "),
        (r"!\s", ""),
        (r'"\s', ""),
    )
    for pattern, replacement in substitutions:
        aline = re.sub(pattern, replacement, aline)
    aline = aline.replace(" /", "/")
    aline = aline.replace(" .", ".")
    return aline.strip() + '.'
def dic_add(adic, akey):
    """Increment the count stored under ``akey`` in ``adic``.

    A key that is not present yet starts at 1.

    Args:
        adic: Mapping from key to integer count; modified in place.
        akey: Key whose count is incremented.
    """
    # dict.get works on Python 2 and 3 (dict.has_key is Python 2 only).
    adic[akey] = adic.get(akey, 0) + 1
if len(sys.argv) == 1:
    # No argument given: analyse the default file.
    # Raw string: in a normal literal "\U" and "\u" start unicode escapes on
    # Python 3 and stop the file from compiling.
    main(r'C:\Users\huangzhiqiang\PycharmProjects\untitled\ef4_6k.txt')
elif len(sys.argv) == 2:
    # One argument: the path of the file to analyse.
    print(sys.argv[1])
    main(sys.argv[1])
else:
    print("parameters error\n")
#main('C:\Users\huangzhiqiang\PycharmProjects\untitled\ef4_6k.txt');