OS模块

能与操作系统交互，控制文件 / 文件夹

# 创建文件夹
import os
os.mkdir(r'D:\py_case\test')

# 删除文件夹
os.rmdir(r'D:\py_case\test')

# # 列出指定目录下所有文件和子目录 （子目录文件不会列出来）
res = os.listdir(r'D:\pycharm_project\Test')
print(res)
# 打印结果：['.idea', 'ex16.py', 'test.json', 'test.pkl', 'test.py']

# 列出当前文件所在的文件夹
res = os.getcwd()
print(res)
# 打印结果： D:\pycharm_project\Test

# Get the absolute path of the current file.
# __file__ is set by Python itself for any module loaded from a file; it is not PyCharm-specific.
res = os.path.abspath(__file__)  # abspath returns a normalized path using the separator of the current OS
print(res)   # D:\pycharm_project\Test\test.py

# 文件的文件夹
res = os.path.dirname(os.path.abspath(__file__))
print(res)   # D:\pycharm_project\Test
res = os.path.dirname(os.path.dirname(os.path.abspath(__file__)))
print(res)    # D:\pycharm_project


# 拼接文件路径
res = os.path.join(os.path.dirname(os.path.abspath(__file__)), 'img', 'test.jpg')
print(res)   # D:\pycharm_project\Test\img\test.jpg


# 判断路径是否存在（文件 / 文件夹都适用）
res = os.path.exists(r'D:\pycharm_project\Test\ex16.py')
print(res)   # True


# 判断是否为文件
res = os.path.isfile(r'D:\pycharm_project\Test\aaa')
print(res)  # False


# Delete a file
os.remove('test.txt')

# Rename a file
# NOTE: these two calls are independent examples. Run back to back as written, the
# rename raises FileNotFoundError because 'test.txt' was just removed above.
os.rename('test.txt', 'test2.txt')

# 判断是否为文件夹
res = os.path.isdir(r'D:\pycharm_project\Test\aaa')
print(res)   #True


# 与终端交互，输入命令
res = os.system('dir')
print(res)
res = os.walk(r'D:\pycharm_project\Test\aaa')
print(res)   # <generator object walk at 0x0000027DB46916D8>


# Walk the directory tree and print the full path of every Python source file.
res = os.walk(r'D:\pycharm_project\Test')
# Each item yielded by os.walk is (directory path, sub-directory names, file names).
for dir_path, _, files in res:   # named dir_path so the builtin dir() is not shadowed
    for file in files:
        file_path = os.path.join(dir_path, file)  # full path of this file
        if file_path.endswith('.py'):   # '.py', not 'py': a name such as 'happy' must not match
            print(file_path)
            
            
# 打印结果：
'''
D:\pycharm_project\Test\ex16.py
D:\pycharm_project\Test\test.py
D:\pycharm_project\Test\aaa\12.py
'''

方法	详解
os.getcwd()	获取当前工作目录，即当前python脚本工作的目录路径
os.chdir(“dirname”)	改变当前脚本工作目录；相当于shell下cd
os.curdir	返回当前目录: (‘.’)
os.pardir	获取当前目录的父目录字符串名：(‘..’)
os.makedirs(‘dirname1/dirname2’)	可生成多层递归目录
os.removedirs(‘dirname1’)	若目录为空，则删除，并递归到上一级目录，如若也为空，则删除，依此类推
os.mkdir(‘dirname’)	生成单级目录；相当于shell中mkdir dirname
os.rmdir(‘dirname’)	删除单级空目录，若目录不为空则无法删除，报错；相当于shell中rmdir dirname
os.listdir(‘dirname’)	列出指定目录下的所有文件和子目录，包括隐藏文件，并以列表方式打印
os.remove()	删除一个文件
os.rename(“oldname”,”newname”)	重命名文件/目录
os.stat(‘path/filename’)	获取文件/目录信息
os.sep	输出操作系统特定的路径分隔符，win下为"\"，Linux下为"/"
os.linesep	输出当前平台使用的行终止符，win下为"\r\n"，Linux下为"\n"
os.pathsep	输出用于分割文件路径的字符串 win下为;,Linux下为:
os.name	输出字符串指示当前使用平台。win->’nt’; Linux->’posix’
os.system(“bash command”)	运行shell命令，直接显示
os.environ	获取系统环境变量
os.path.abspath(path)	返回path规范化的绝对路径
os.path.split(path)	将path分割成目录和文件名二元组返回
os.path.dirname(path)	返回path的目录。其实就是os.path.split(path)的第一个元素
os.path.basename(path)	返回path最后的文件名。如何path以／或\结尾，那么就会返回空值。即os.path.split(path)的第二个元素
os.path.exists(path)	如果path存在，返回True；如果path不存在，返回False
os.path.isabs(path)	如果path是绝对路径，返回True
os.path.isfile(path)	如果path是一个存在的文件，返回True。否则返回False
os.path.isdir(path)	如果path是一个存在的目录，则返回True。否则返回False
os.path.join(path1[, path2[, …]])	将多个路径组合后返回，第一个绝对路径之前的参数将被忽略
os.path.getatime(path)	返回path所指向的文件或者目录的最后存取时间
os.path.getmtime(path)	返回path所指向的文件或者目录的最后修改时间
os.path.getsize(path)	返回path的大小

sys模块

与python解释器交互

import sys

## 最常用，当使用命令行式运行文件，接收多余的参数
res = sys.argv
print(res)

方法	详解
sys.argv	命令行参数List，第一个元素是程序本身路径
sys.modules.keys()	返回所有已经导入的模块列表
sys.exc_info()	获取当前正在处理的异常类,exc_type、exc_value、exc_traceback当前处理的异常详细信息
sys.exit(n)	退出程序，正常退出时exit(0)
sys.hexversion	获取Python解释程序的版本值，16进制格式如：0x020403F0
sys.version	获取Python解释程序的版本信息
sys.maxint	最大的Int值（仅Python 2；Python 3中已移除，可用sys.maxsize代替）
sys.maxunicode	最大的Unicode值
sys.modules	返回系统导入的模块字段，key是模块名，value是模块
sys.path	返回模块的搜索路径，初始化时使用PYTHONPATH环境变量的值
sys.platform	返回操作系统平台名称
sys.stdout	标准输出
sys.stdin	标准输入
sys.stderr	错误输出
sys.exc_clear()	用来清除当前线程所出现的当前的或最近的错误信息（仅Python 2；Python 3中已移除）
sys.exec_prefix	返回平台相关（platform-dependent）的python文件安装的位置
sys.byteorder	本地字节规则的指示器，big-endian平台的值是’big’,little-endian平台的值是’little’
sys.copyright	记录python版权相关的东西
sys.api_version	解释器的C的API版本

json和pickle模块

序列化和反序列化

序列化：按照特定的规则排列，把python数据类型转化为json串，便于跨平台传输
反序列化：把json串转化为python / java / c / php 等语言所需要的数据类型

Json序列化并不是python独有的，json序列化在java等语言中也会涉及到，因此使用json序列化能够达到跨平台传输数据的目的。

json数据类型和python数据类型对应关系表

Json类型	Python类型
{}	dict
[]	list
“string”	str
520.13	int或float
true/false	True/False
null	None

json模块

import json  # required by json.dumps / json.loads below

dic = {'a': 1, 'b': 2, 'c': None}

# Serialize: dict -> JSON text. JSON strings always use double quotes and None becomes null.
data = json.dumps(dic)
print(type(data), data)
# Deserialize: JSON text -> dict.
data = json.loads(data)
print(type(data), data)

# 打印结果：
'''
<class 'str'> {"a": 1, "b": 2, "c": null}
<class 'dict'> {'a': 1, 'b': 2, 'c': None}
'''
import json  # required by json.dump / json.load below

dic = {'a': 1, 'b': 2, 'c': None}

# Serialize the dict as JSON straight into a file.
with open('test.json', 'w', encoding='utf-8') as fw:
    json.dump(dic, fw)

# Deserialize: read the same file back into a dict.
with open('test.json', 'r', encoding='utf-8') as fr:
    data = json.load(fr)
    print(type(data), data)  # <class 'dict'> {'a': 1, 'b': 2, 'c': None}

pickle

pickle 序列化和其他编程语言各自特有的序列化格式一样，只能用于 Python，无法跨语言使用。但是 pickle 的好处是可以存储 Python 中绝大多数的数据类型，包括自定义对象，而 json 不可以。注意：函数和类是按“引用”（模块名 + 名称）保存的，反序列化时必须能够导入到原来的定义。

import pickle

se = {1, 3, 4, 5, 6}
with open('test.pkl', 'wb') as fw:
    pickle.dump(se, fw)
se = {1, 3, 4, 5, 6}

def func():
    x = 3
    def wrapper():
        print(x)
    return wrapper

with open('test.pkl', 'wb') as fw:
    pickle.dump(func, fw)

with open('test.pkl', 'rb') as fr:
    data = pickle.load(fr)
    # print(data)
    res = data()
    res()

logging模块

v1:

import logging

# Log levels (without any configuration only level 30 and above is shown)
logging.info('info')   # INFO = 20
logging.debug('debug')     # DEBUG = 10
logging.warning('warning')   # WARNING = 30
logging.error('error')     # ERROR = 40
logging.critical('critical')  # CRITICAL = 50

# 打印结果：
'''
WARNING:root:warning
ERROR:root:error
CRITICAL:root:critical
'''

v2:

import logging

# 日志的基本配置

logging.basicConfig(filename='access.log',
                    format='%(asctime)s - %(name)s - %(levelname)s -%(module)s: %(message)s',
                    datefmt='%Y-%m-%d %H:%M:%S %p',
                    level=10)  # level=10 is DEBUG: records at level 10 and above are written to the file

logging.info('正常信息')  # INFO = 20
logging.debug('调试信息')  # DEBUG = 10
logging.warning('警告信息')  # WARNING = 30
logging.error('报错信息')  # ERROR = 40
logging.critical('严重错误信息')  # CRITICAL = 50


# 会创建一个access.log日志：
'''
2019-09-27 21:57:45 PM - root - DEBUG -test: 调试信息
2019-09-27 21:57:45 PM - root - INFO -test: 正常信息
2019-09-27 21:57:45 PM - root - WARNING -test: 警告信息
2019-09-27 21:57:45 PM - root - ERROR -test: 报错信息
2019-09-27 21:57:45 PM - root - CRITICAL -test: 严重错误信息

'''

v3: 自定义配置

import logging

# 1. Obtain the logger objects.
# The logging documentation says loggers must be obtained through logging.getLogger()
# and never created by instantiating logging.Logger directly.
cwz_logger = logging.getLogger('cwz')
neo_logger = logging.getLogger('neo')
for _logger in (cwz_logger, neo_logger):
    # A logger from getLogger() inherits the root level (WARNING) and also passes records
    # on to the root handlers. Set both explicitly so info() records are emitted, and only
    # by the handlers attached below.
    _logger.setLevel(logging.DEBUG)
    _logger.propagate = False

# 2. Formatters
formatter1 = logging.Formatter('%(asctime)s - %(name)s -%(thread)d - %(levelname)s -%(module)s:  %(message)s',
                               datefmt='%Y-%m-%d %H:%M:%S %p ', )

formatter2 = logging.Formatter('%(asctime)s :  %(message)s',
                               datefmt='%Y-%m-%d %H:%M:%S %p', )

formatter3 = logging.Formatter('%(name)s %(message)s', )

# 3. Handlers: where the records go (a file or the terminal)
h1 = logging.FileHandler('cwz.log')
h2 = logging.FileHandler('neo.log')
sm = logging.StreamHandler()

# 4. Give each handler its format
h1.setFormatter(formatter1)
h2.setFormatter(formatter2)
sm.setFormatter(formatter3)

# 5. Attach the handlers to the loggers
cwz_logger.addHandler(h1)
cwz_logger.addHandler(sm)
neo_logger.addHandler(h2)

# 6. Use the logger: this record goes to cwz.log and to the terminal
cwz_logger.info('cwz 购买 变形金刚 8个')

time模块

时间戳

1
2
3

import time

print(time.time())   # 从1970年1月1日00:00:00开始计算到现在的秒数

格式化时间

import time 
print(time.strftime('%Y-%m-%d %H:%M:%S'))  # 2019-09-28 17:15:47

print(time.strftime('%Y-%m-%d %X'))   # 2019-09-28 17:16:50

结构化时间

import time
print(time.localtime())   # time.struct_time(tm_year=2019, tm_mon=9, tm_mday=28, tm_hour=17, tm_min=18, tm_sec=11, tm_wday=5, tm_yday=271, tm_isdst=0)
# 结构化基础时间
import time
print(time.localtime(0))   # time.struct_time(tm_year=1970, tm_mon=1, tm_mday=1, tm_hour=8, tm_min=0, tm_sec=0, tm_wday=3, tm_yday=1, tm_isdst=0)

sleep

import time 

start = time.time()
time.sleep(2)
end = time.time()
print(f'暂停了{end - start}秒')  # 暂停了2.000108003616333秒

文本进度条展示

import time

print(time.time())  # 从1970.01.01.00:00开始计算时间
import time

print('-------')
time.sleep(3)  # 睡眠
print('-------')
# cpu级别的时间计算，一般用于程序耗时时间计算
import time

start = time.perf_counter()
for i in range(10):
    print(i)
    time.sleep(0.01)
print(time.perf_counter() - start)

# 打印结果：
0
1
2
3
4
5
6
7
8
9
0.10681829999999998

文本进度条

'''
 0 %[->..........]
10 %[*->.........]
20 %[**->........]
30 %[***->.......]
40 %[****->......]
50 %[*****->.....]
60 %[******->....]
70 %[*******->...]
80 %[********->..]
90 %[*********->.]
100%[**********->]
'''

简单开始

星号在递增，小点在递减，用一个循环配合字符串乘法即可实现

for i in range(10):
    print('*'* i + '.' * (10 - i))

# 打印结果：
..........
*.........
**........
***.......
****......
*****.....
******....
*******...
********..
*********.
for i in range(10):
    print(f'[{"*" * i} -> {"." * (10 - i)}]')
    
 # 打印结果：
[ -> ..........]
[* -> .........]
[** -> ........]
[*** -> .......]
[**** -> ......]
[***** -> .....]
[****** -> ....]
[******* -> ...]
[******** -> ..]
[********* -> .]
for i in range(10):
    print(f'{i*10: ^3}% [{"*" * i} -> {"." * (10 - i)}]')
    
# 打印结果：
 0 % [ -> ..........]
10 % [* -> .........]
20 % [** -> ........]
30 % [*** -> .......]
40 % [**** -> ......]
50 % [***** -> .....]
60 % [****** -> ....]
70 % [******* -> ...]
80 % [******** -> ..]
90 % [********* -> .]

继续修改

scale = 11
for i in range(scale):
    print(f'{(i/scale)*scale: ^3.1f}% [{"*" * i} -> {"." * (scale - i)}]')
    
# 打印结果：
0.0% [ -> ...........]
1.0% [* -> ..........]
2.0% [** -> .........]
3.0% [*** -> ........]
4.0% [**** -> .......]
5.0% [***** -> ......]
6.0% [****** -> .....]
7.0% [******* -> ....]
8.0% [******** -> ...]
9.0% [********* -> ..]
10.0% [********** -> .]

单条显示

scale = 101
for i in range(scale):
    print(f'\r{(i/scale)*scale: ^3.1f}% [{"*" * i} -> {"." * (scale - i)}]', end='')
    
# 打印结果：
100.0% [**************************************************************************************************** -> .]

文本进度条最终形式

import time

# Final text progress bar: one line that is redrawn in place, followed by the elapsed time.
start = time.perf_counter()
scale = 101
for i in range(scale):
    # (i / scale) * scale is just i (up to float rounding), so with scale = 101 the number shown runs 0 .. 100.
    # The leading '\r' together with end='' makes every print overwrite the same terminal line.
    print(f'\r{(i / scale) * scale: ^3.1f}% [{"*" * i} -> {"." * (scale - i)}] {time.perf_counter() - start:.2f}s',
          end='')
    time.sleep(0.1)  # slow the loop down so the progress is visible

datetime模块

import datetime

# 输出当前时间
print(datetime.datetime.now())
# 2019-09-28 17:25:24.551237


# 加时间
now = datetime.datetime.now()
print(now + datetime.timedelta(days=3))
# 2019-10-01 17:28:24.710093


# 时间替换
print(now.replace(year=1940))
# 1940-09-28 17:29:45.066855

random模块

import random

# 0-1随机数
print(random.random())


# 1-100随机整数
print(random.randint(1,100))

# Random integer from 1 up to but not including 3, i.e. 1 or 2 (randrange excludes the stop value)
print(random.randrange(1, 3))

# 打乱lt的顺序
lt = [1,2,4,60]
random.shuffle(lt)   # [4, 1, 60, 2]
print(lt)

# 随机选择lt中一个元素
print(random.choice(lt))

# random.seed
import random

random.seed(4)   # 给一个随机数种子
print(random.random())   # 只第一次随机生成，之后生成的数字就一样了
print(random.random())

# 如果不自定义种子，则种子按照当前的时间来

计算圆周率

公式法计算

圆周率计算公式：

$\pi = \sum_{k=0}^\infty [\frac{1}{16^k} (\frac{4}{8k+1}-\frac{2}{8k+4}-\frac{1}{8k+5}-\frac{1}{8k+6})]$

# Approximate pi with the series above, printing the running value after every term.
# Each term shrinks by more than a factor of 16, so a double-precision float stops changing
# after roughly a dozen terms. The loop is bounded because the original `while True`
# never terminated.
TERMS = 20
pi = 0
for k in range(TERMS):
    pi += (1 / (16 ** k)) * (4 / (8 * k + 1) - 2 / (8 * k + 4) - 1 / (8 * k + 5) - 1 / (8 * k + 6))
    print(pi)

蒙特卡罗方法计算圆周率

import random

count = 0
for i in range(1000000):
    x, y = random.random(), random.random()
    distance = pow(x**2 + y**2, 0.5)
    if distance < 1:
        count += 1
print(count/1000000*4)

typing模块

与函数联用，控制函数参数的数据类型，提供了基础数据类型之外的数据类型

from typing import Iterable

def func(x: int, lt: list) -> list:
    """Return the fixed list [1, 2, 3]; the arguments exist only to show annotations."""
    result = [1, 2, 3]
    return result

func(10,[12,3])

typing模块作用

类型检查，防止运行时出现参数和返回值类型不符合。
作为开发文档附加说明，方便使用者调用时传入和返回参数类型。
该模块加入后并不会影响程序的运行，不会报正式的错误，只有提醒。
注意：typing模块只有在python3.5以上的版本中才可以使用,pycharm目前支持typing检查

typing常用类型

int、float: 整型、浮点型（long 长整型仅存在于 Python 2，Python 3 中已并入 int）
bool、str: 布尔型、字符串类型
List、 Tuple、 Dict、 Set:列表、元组、字典、集合
Iterable、Iterator:可迭代类型、迭代器类型
Generator：生成器类型

hashlib模块

hash是什么

hash是一种算法（Python3.版本里使用hashlib模块代替了md5模块和sha模块，主要提供 SHA1、SHA224、SHA256、SHA384、SHA512、MD5 算法），该算法接受传入的内容，经过运算得到一串hash值。

import hashlib

m = hashlib.md5()
m.update(b'sayhello')   # feed the bytes to hash; digest noted by the author: 981fe96ed23ad8b9554cfeea38cd334a
print(m.hexdigest())   # different inputs almost always give different digests, but collisions are possible

撞库破解hash算法

# Dictionary attack: hash every candidate password and compare it with the known digest.
pwd_list = [
    'hash3714',
    'hash1313',
    'hash94139413',
    'hash123456',
    '123456hash',
    'h123ash',
]

hash_pwd = '0562b36c3c5a3925dbe3c4d32a4f2ba2'

for pwd in pwd_list:
    m = hashlib.md5()
    m.update(pwd.encode('utf-8'))
    res = m.hexdigest()
    # Compare whole digests with ==. `in` is a substring test and only happened to work
    # because both strings have the same length.
    if res == hash_pwd:
        print(f'获取密码成功：{pwd}')  # output noted by the author: 获取密码成功：hash123456

hmac模块

密钥加盐

import hmac

# The first argument is the secret key (the "salt"): the same message hashed with a
# different key gives a different digest.
# digestmod is required since Python 3.8; 'md5' was the implicit default before that.
m = hmac.new(b'haha', digestmod='md5')
m.update(b'hash123456')
print(m.hexdigest())    # digest noted by the author: 24bb8daab11e526fc9b8178e51bc2ae7


m = hmac.new(b'sadness', digestmod='md5')
m.update(b'hash123456')
print(m.hexdigest())   # digest noted by the author: df405ffd019d6d3cd9a190fcab33aca5

re模块

作用就是去字符串找符合某种特点的字符串

re模块的基本使用：

1
2
3

import re  # 第一步，要引入re模块
a = re.findall("匹配规则", "这个字符串是否有匹配规则的字符")  # 第二步，调用模块函数
print(a)  # 以列表形式返回匹配到的字符串

^字符

以……开头

s = 'abcdabc'

res = re.findall('^abc',s)  
print(res)                  # ['abc']
res = re.findall('^bc',s)
print(res)                  # []

$ 字符

以……结尾

s = 'abcdabc'

res = re.findall('bc$',s)
print(res)             # ['bc']

. : 任意字符

s = 'abc是dabc'

res = re.findall('abc.',s)
print(res)   # ['abc是']

\d：数字

s = 'asdhg213214h4c'

# Raw string: '\d' in a normal string literal is an invalid escape sequence (current Python warns about it).
res = re.findall(r'\d', s)
print(res)   # ['2', '1', '3', '2', '1', '4', '4']

\D: 非数字

s = 'asdhg2132 -14h4c'

# Raw string so that the backslash reaches the regex engine untouched.
res = re.findall(r'\D', s)
print(res)   # ['a', 's', 'd', 'h', 'g', ' ', '-', 'h', 'c']

\w: 单词字符（字母、数字、下划线）

s = 'asdhg213214h4c'

# Raw string so that the backslash reaches the regex engine untouched.
res = re.findall(r'\w', s)
print(res)   # ['a', 's', 'd', 'h', 'g', '2', '1', '3', '2', '1', '4', 'h', '4', 'c']

\W: 非单词字符（除字母、数字、下划线以外的字符）

s = 'as;g:21?32    -14h4c\n'

# Raw string so that the backslash reaches the regex engine untouched.
res = re.findall(r'\W', s)
print(res)  # [';', ':', '?', ' ', ' ', ' ', ' ', '-', '\n']

\s: 空白字符（空格、制表符、换行等）

s = 'asdhg2132 14h4c'

# Raw string so that the backslash reaches the regex engine untouched.
res = re.findall(r'\s', s)
print(res)    # [' ']

\S : 非空白字符

s = 'asdhg2132    -14h4c\n'

# Raw string so that the backslash reaches the regex engine untouched.
res = re.findall(r'\S', s)
print(res)  # ['a', 's', 'd', 'h', 'g', '2', '1', '3', '2', '-', '1', '4', 'h', '4', 'c']

+: 前面的1个字符至少1个

s = 'abcdddd abcd abc ab'
print(re.findall('abc+', s))  # ['abc', 'abc', 'abc']

?: 前面的1个字符0-1个

s = 'abcdddd abcd abc ab a'
print(re.findall('abc?', s))  # ['abc', 'abc', 'abc', 'ab']

*: 前面的1个字符至少0个

s = 'abcdddd abcd abc ab a'
print(re.findall('abcd*', s))   # ['abcdddd', 'abcd', 'abc']
s = 'abc bbc cbc dbc'
print(re.findall('[abc]bc', s))  # ['abc', 'bbc', 'cbc']

[^]: 中括号内的字符都不可以

s = 'abc bbc cbc dbc'
print(re.findall('[^abc]bc', s))  # ['dbc']

| : 或

s = 'abc bbc dbc'
# Alternation is a bare |. An escaped \| would match a literal pipe character and find nothing here.
print(re.findall('abc|bbc', s))  # ['abc', 'bbc']

{2} : 前面的字符2个

1
2
3

s = 'abcccab abcc'
print(re.findall('abc{2}', s))   # ['abcc', 'abcc'] 
print(re.findall('abc{0,2}', s))  # ['abcc', 'ab', 'abcc']

贪婪模式

(任意字符) * (0-无穷个)

1
2
3

s = 'abcdefgaaaaaaaaaaag'
print(re.findall('a.*g',s))
# ['abcdefgaaaaaaaaaaag']

非贪婪模式

(任意字符) * (0-无穷个) ？

s = 'abcdefgbbbbbbbg'
print(re.findall('a.*?g', s))  # ['abcdefg']

了解：特殊构造

# a(?=\d): an 'a' that is followed by a digit. The digit is only checked, not returned
# and not consumed.
s = 'a123 aaaa a234 abc'
print(re.findall(r'a(?=\d)', s))  # ['a', 'a']

# a(?=\w): an 'a' that is followed by any word character.
print(re.findall(r'a(?=\w)', s))  # ['a', 'a', 'a', 'a', 'a', 'a']

re模块常用的功能函数

compile

# re.compile builds a pattern object once so it can be reused; it can be passed to
# re.findall or called as email_pattern.findall(s).
s = '#@#@#@nickchen121@163.com$$$$////nick@qq.com$$#$#$[]]2287273393@162.com@$2423423lksdlfj#'
# The dot before "com" is escaped: an unescaped '.' matches any character, e.g. 'a@bXcom'.
email_pattern = re.compile(r'\w+@\w+\.com')
phone_patter = re.compile(r'\d{13}')  # not used by the code below
print(re.findall(email_pattern, s))

match : 从字符串开头匹配，找到一个就不再继续；开头匹配不上则返回None（此时调用.group()会报错）

1
2
3

s = 'abcd abcddd abc'
res = re.match('abcd*', s)
print(res.group())  # abcd

search：从字符串找一个，就不找了

1
2
3

s = 'abcd abcddd abc'
res = re.search('abcd*', s)
print(res.group())  # abcd

split 切分

s = 'adad213114242wjdnadjia1241423daj'
print(re.split(r'\d+', s))  # ['adad', 'wjdnadjia', 'daj']

sub 替换

s = 'adad213114242wjdnadjia1241423daj'
print(re.sub(r'\d+', ' ', s))  # adad wjdnadjia daj

subn 替换，比sub多了替换了多少次

s = 'adad213114242wjdnadjia1241423daj'
print(re.subn(r'\d+', ' ', s))  # ('adad wjdnadjia daj', 2)

补充: re.S

s = '''abc
abcabc*abc
'''

print(re.findall('abc.abc',s ))  # ['abc*abc']   原本.不匹配换行

print(re.findall('abc.abc',s ,re.S))  # ['abc\nabc', 'abc*abc']

分组：只要括号里的

s = 'abc abcd abcddd'
print(re.findall('a(.)c(d)', s))  # [('b', 'd'), ('b', 'd')]

有名分组

s = 'abc abcd abcddd'
print(re.search('a(?P<name1>.)c(?P<name2>d)', s).groupdict())  # {'name1': 'b', 'name2': 'd'}

超高级用法

s = 'abc123abc123'  # the part that matches is c123a
# Raw strings keep \d and \g intact for the regex engine.
print(re.sub(r'c(\d+)a', ' ', s))
# \g<name1> in the replacement inserts the text captured by the group named name1.
print(re.sub(r'c(?P<name1>\d+)a', r' \g<name1> ', s))

# ab bc123
# ab 123 bc123

数据分析常用的三个模块

numpy模块

numpy简介

numpy官方文档：https://docs.scipy.org/doc/numpy/reference/?v=20190307135750

numpy是Python的一种开源的数值计算扩展库。这种库可用来存储和处理大型numpy数组，比Python自身的嵌套列表结构要高效的多（该结构也可以用来表示numpy数组）。

numpy库有两个作用：

区别于list列表，提供了数组操作、数组运算、以及统计分布和简单的数学模型
计算速度快，甚至要优于python内置的简单运算，使得其成为pandas、sklearn等模块的依赖包。高级的框架如TensorFlow、PyTorch等，其数组操作也和numpy非常相似。

numpy使用

import numpy as np

lt1 = [1,2,3]
lt2 = [4,5,6]
arry1 = np.array(lt1)
arry2 = np.array(lt2)
print(arry1*arry2)    # [ 4 10 18]

创建numpy数组

# 一维数组
arr1 = np.array([1,2,4])
print(type(arr1), arr1)  # <class 'numpy.ndarray'> [1 2 4]

# 二维数组
arr = np.array([
    [1,2,3],
    [4,5,6]
])

print(arr)  
'''
[[1 2 3]
 [4 5 6]]
'''


# 三维数组
arr = np.array([
    [[1, 2, 3],
     [4, 5, 6]],
    [[1, 2, 3],
     [4, 5, 6]]
])

print(arr)
'''
[[[1 2 3]
  [4 5 6]]

 [[1 2 3]
  [4 5 6]]]
'''

numpy数组的常用属性

属性	解释
T	数组的转置（对高维数组而言）
dtype	数组元素的数据类型
size	数组元素的个数
ndim	数组的维数
shape	数组的维度大小（以元组形式）
astype	类型转换

dtype 数组元素的数据类型，numpy数组是属于python解释器的，int32 / float64 属于numpy

# 转置
arr = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

print(f'arr的转置为\n{arr.T}')
'''
arr的转置为
[[1 4]
 [2 5]
 [3 6]]
'''

# dtype数据类型
arr = np.array([
    [1., 2., 3.],
    [4, 5, 6]
])

print(arr.dtype)  # float64


# 数组元素的个数
print(arr.size)   # 6

# 数组的维数
print(arr.ndim)  # 2

# 数组的维度大小（以元组形式， 几行几列）
print(arr.shape)  # （2，3）

# 数组类型转换
arr = np.array([
    [1, 2, 3],
    [4, 5, 6]
])
res = arr.astype(np.float64)
print(res) 
'''
[[1. 2. 3.]
 [4. 5. 6.]]
'''

切割numpy数组

切分数组类似于列表的切割，numpy数组的切割涉及到行和列的切割

arr = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

print(arr[:,:])  # 行；列  取整个数组
print(arr[0,0])  # 取第一行第一列，1
print(arr[0,:])  # 取第一行  [1 2 3]
print(arr[:,2:])  # 取第三列 
'''
[[3]
 [6]]
'''

赋值

arr = np.array([
    [1, 2, 3],
    [4, 5, 6]
])
arr[0,0] = 0
print(arr)   # 将二维数组第一行第一列元素赋值为0

arr[:,:] = 0
print(arr)  # 全部换为0
'''
[[0 0 0]
 [0 0 0]]
'''

数组合并

# 水平合并
arr1 = np.array([
    [1, 2, 3],
    [4, 5, 6]
])

arr2 = np.array([
    [7, 8, 9],
    ['a', 'b', 'c']
])

print(np.hstack((arr1,arr2))) # takes one sequence of arrays (a tuple or a list), not separate arguments
'''
[['1' '2' '3' '7' '8' '9']
 ['4' '5' '6' 'a' 'b' 'c']]
'''

# 垂直合并
print(np.vstack((arr1, arr2)))
'''
[['1' '2' '3']
 ['4' '5' '6']
 ['7' '8' '9']
 ['a' 'b' 'c']]
'''

print(np.concatenate((arr1, arr2)))  # default axis=0: joins along the first axis, stacking the rows (same result as vstack here)
'''
[['1' '2' '3']
 ['4' '5' '6']
 ['7' '8' '9']
 ['a' 'b' 'c']]
'''
print(np.concatenate((arr1, arr2), axis=1)) # axis=1: joins along the second axis, side by side (same result as hstack here)
'''
[['1' '2' '3' '7' '8' '9']
 ['4' '5' '6' 'a' 'b' 'c']]
'''

通过函数创建数组

方法	详解
array()	将列表转换为数组，可选择显式指定dtype
arange()	range的numpy版，支持浮点数
linspace()	类似arange()，第三个参数为数组长度
zeros()	根据指定形状和dtype创建全0数组
ones()	根据指定形状和dtype创建全1数组
eye()	创建单位矩阵
empty()	创建一个未初始化的数组（元素是内存中残留的任意值，并非随机数）
reshape()	重塑形状

ones / zeros / eye / empty

# ones
print(np.ones((2,3)))  # 构建两行三列的1
'''
[[1. 1. 1.]
 [1. 1. 1.]]
'''

# zeros
print(np.zeros((2,3)))  # 创建2行3列的0
'''
[[0. 0. 0.]
 [0. 0. 0.]]
'''

# eye
print(np.eye(3,3))  # 创建单位矩阵
'''
[[1. 0. 0.]
 [0. 1. 0.]
 [0. 0. 1.]]
'''

# empty
print(np.empty((2,2)))  # 2x2 array whose memory is left uninitialized: the values are arbitrary leftovers, not random numbers
'''
[[1.42419938e-306 9.34609790e-307]
 [1.29060871e-306 7.56601165e-307]]
'''

linspace / logspace

print(np.linspace(1,100,10)) # 创建一个等差数列，1-100，有10个数
# [  1.  12.  23.  34.  45.  56.  67.  78.  89. 100.]

print(np.logspace(1,10,5))  # 创建等比数列、
# [1.00000000e+01 1.77827941e+03 3.16227766e+05 5.62341325e+07 1.00000000e+10]

arange

print(np.arange(2,10))
# [2 3 4 5 6 7 8 9]

print(np.arange(2,10,2))  # 2-10，步长为2的数组
# [2 4 6 8]

reshape 重构形状

arr = np.ones([2,2])  # 本来是2*2的1
# print(arr) 
print(arr.reshape(1,4))  # 变成1*4的1
# [[1. 1. 1. 1.]]

数组运算

运算符	说明
+	两个numpy数组对应元素相加
-	两个numpy数组对应元素相减
*	两个numpy数组对应元素相乘
/	两个numpy数组对应元素相除，结果为浮点数（整除取商请使用//）
%	两个numpy数组对应元素相除后取余数
**n	单个numpy数组每个元素都取n次方，如**2：每个元素都取平方

arr = np.array([
    [3, 4, 56],
    [12, 4, 25]
])
print(arr / 2)
'''
[[ 1.5  2.  28. ]
 [ 6.   2.  12.5]]
'''

arr = np.array([
    [3, 4, 56],
    [12, 4, 25]
])
print(arr ** 2)
'''
[[   9   16 3136]
 [ 144   16  625]]
'''

numpy数组函数运算

numpy数组函数	详解
np.sin(arr)	对numpy数组arr中每个元素取正弦，$\sin(x)$
np.cos(arr)	对numpy数组arr中每个元素取余弦，$\cos(x)$
np.tan(arr)	对numpy数组arr中每个元素取正切，$\tan(x)$
np.arcsin(arr)	对numpy数组arr中每个元素取反正弦，$\arcsin(x)$
np.arccos(arr)	对numpy数组arr中每个元素取反余弦，$\arccos(x)$
np.arctan(arr)	对numpy数组arr中每个元素取反正切，$\arctan(x)$
np.exp(arr)	对numpy数组arr中每个元素取指数函数，$e^x$
np.sqrt(arr)	对numpy数组arr中每个元素开根号，$\sqrt{x}$

arr = np.array([
    [3, 4, 56],
    [12, 4, 25]
])
print(np.sin(arr))
'''
[[ 0.14112001 -0.7568025  -0.521551  ]
 [-0.53657292 -0.7568025  -0.13235175]]
'''

numpy.random生成随机数

函数名称	函数功能	参数说明
rand($d_0, d_1, \cdots, d_n$)	产生均匀分布的随机数	$d_n$为第n维数据的维度
randn($d_0, d_1, \cdots, d_n$)	产生标准正态分布随机数	$d_n$为第n维数据的维度
randint(low[, high, size, dtype])	产生随机整数	low:最小值；high:上限（不包含）；size:数据个数
random_sample([size])	在$[0,1)$内产生随机数	size为随机数的shape，可以为元组或者列表
choice(a[, size])	从a中随机选择指定数据	a为1维数组；size为数组形状
uniform(low,high [,size])	给定形状产生[low, high)内均匀分布的随机数组	low为最小值；high为最大值，size为数组形状
shuffle(a)	与random.shuffle相同	a为指定数组

matplotlib模块

条形图

初始的:

from matplotlib import pyplot as plt

classes = ['3班', '4班', '5班', '6班']
students = [55, 45, 60, 50]

plt.bar(classes, students)
plt.show()

from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # loads a font file that has Chinese glyphs

# Raw string: the backslashes of a Windows path must not be read as escape sequences.
font = FontProperties(fname=r'C:\Windows\Fonts\simkai.ttf')


classes = ['3班', '4班', '5班', '6班']
students = [55, 45, 60, 50]
classes_index = range(len(classes))

plt.bar(classes_index, students)
# The text property is spelled `fontproperties`. Current matplotlib no longer lower-cases
# keyword names, so `FontProperties=` raises an error.
plt.xticks(classes_index, classes, fontproperties=font)

plt.show()

from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # loads a font file that has Chinese glyphs

# Raw string: the backslashes of a Windows path must not be read as escape sequences.
font = FontProperties(fname=r'C:\Windows\Fonts\simkai.ttf')

plt.style.use('ggplot')  # use the ggplot style sheet for the background

classes = ['3班', '4班', '5班', '6班']
students = [55, 45, 60, 50]
classes_index = range(len(classes))

plt.bar(classes_index, students, color='darkblue')

# The text property is spelled `fontproperties`. Current matplotlib no longer lower-cases
# keyword names, so `FontProperties=` raises an error.
plt.xlabel('学生', fontproperties=font)
plt.ylabel('学生人数', fontproperties=font)
plt.title('班级-学生人数', fontproperties=font, fontsize=20, fontweight=25)
plt.xticks(classes_index, classes, fontproperties=font)  # label the x ticks with the class names

plt.show()

直方图

import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # 修改字体

font = FontProperties(fname='C:\Windows\Fonts\simkai.ttf')  # 中文字体

# 修改背景为条纹
plt.style.use('ggplot')

x1 = np.random.randn(10000)  # 随机生成符合正太分布的数
x2 = np.random.randn(10000)

plt.hist(x1, bins=50, color='darkgreen')  # bins=50表示每个变量的值分成50份，即会有50根柱子
plt.show()

# Histograms: two normal samples drawn side by side on one figure
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # loads a font file that has Chinese glyphs

# Raw string: the backslashes of a Windows path must not be read as escape sequences.
font = FontProperties(fname=r'C:\Windows\Fonts\simkai.ttf')

# Use the ggplot style sheet for the background
plt.style.use('ggplot')

x1 = np.random.randn(10000)  # samples from the standard normal distribution
x2 = np.random.randn(10000)

fig = plt.figure()  # create the figure (canvas)
ax1 = fig.add_subplot(1,2,1) # 1 row x 2 columns, first cell
ax2 = fig.add_subplot(1,2,2) # 1 row x 2 columns, second cell

ax1.hist(x1, bins=50, color='darkgreen')  # bins=50 splits the value range into 50 bars
ax2.hist(x2, bins=50, color='red')

# The text property is spelled `fontproperties`. Current matplotlib no longer lower-cases
# keyword names, so `FontProperties=` raises an error.
# Figure-level title
fig.suptitle('两个正太分布', fontproperties=font)

# Per-subplot titles
ax1.set_title('x1的正态分布', fontproperties=font)
ax2.set_title('x2的正态分布', fontproperties=font)
plt.show()

折线图

# Line chart: four random walks, each with its own colour, marker and line style
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # loads a font file that has Chinese glyphs

# Raw string: the backslashes of a Windows path must not be read as escape sequences.
font = FontProperties(fname=r'C:\Windows\Fonts\simkai.ttf')

# Use the ggplot style sheet for the background
plt.style.use('ggplot')

np.random.seed(1)
# cumsum() turns the normal samples into a running total, so each line drifts like a
# random walk instead of hovering around zero.
x1 = np.random.randn(40).cumsum()
x2 = np.random.randn(40).cumsum()
x3 = np.random.randn(40).cumsum()
x4 = np.random.randn(40).cumsum()

plt.plot(x1, color='red', marker='o', linestyle='-', label='红实线')
plt.plot(x2, color='yellow', marker='x', linestyle='--', label='黄虚线')
plt.plot(x3, color='blue', marker='*', linestyle='-.', label='蓝点线')
# NOTE(review): the label reads "green dotted" but the colour is black; confirm which one is intended.
plt.plot(x4, color='black', marker='s', linestyle=':', label='绿点图')

# loc='best' lets matplotlib choose the legend position; prop supplies the font for the labels
plt.legend(loc='best', prop=font)
plt.show()

散点图+直线图

# Scatter plot and line plot of the same data, side by side
import numpy as np
from matplotlib import pyplot as plt
from matplotlib.font_manager import FontProperties  # loads a font file that has Chinese glyphs
# Raw string: the backslashes of a Windows path must not be read as escape sequences.
font = FontProperties(fname=r'C:\Windows\Fonts\simkai.ttf')

plt.style.use('ggplot')

# plt.figure(figsize=(10, 20)) would set the figure size
fig = plt.figure()
ax1 = fig.add_subplot(1,2,1)
ax2 = fig.add_subplot(1,2,2)

x1 = np.arange(20)
y1 = x1 ** 2

x2 = np.arange(20)
y2 = x2


ax1.scatter(x1, y1, color='red', label='红')
ax1.scatter(x2, y2, color='blue', label='蓝')
ax2.plot(x1,y1)
ax2.plot(x2,y2)

# The text property is spelled `fontproperties`. Current matplotlib no longer lower-cases
# keyword names, so `FontProperties=` raises an error.
fig.suptitle('两张图', fontproperties=font)
ax1.set_title('散点图', fontproperties=font)
ax2.set_title('折线图', fontproperties=font)
ax1.legend(prop=font)
plt.show()

pandas模块

pandas中有两个主要的数据结构，其中Series数据结构类似于Numpy中的一维数组，DataFrame类似于多维表格数据结构。

pandas是python数据分析的核心模块。它主要提供了五大功能:

支持文件存取操作，支持数据库(sql)、html、json、pickle、csv(txt、excel)、sas、stata、hdf等。
支持增删改查、切片、高阶函数、分组聚合等单表操作，以及和dict、list的互相转换。
支持多表拼接合并操作。
支持简单的绘图操作。
支持简单的统计分析操作。

import pandas as pd
index = pd.date_range('2019-01-01', periods=6, freq='M')
print(index)

'''
DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
               '2019-05-31', '2019-06-30'],
              dtype='datetime64[ns]', freq='M')
'''
import numpy as np
import pandas as pd

index = pd.date_range('2019-01-01', periods=6, freq='M')
columns = ['c1','c2','c3','c4']
# print(columns)
val = np.random.randn(6, 4)
# print(val)

df = pd.DataFrame(index=index, columns=columns, data=val)

# Save the DataFrame to an Excel file.
# NOTE(review): writing the legacy .xls format depends on the xlwt engine, which newer pandas
# releases no longer support. Confirm against the installed version, or use 'data.xlsx'.
df.to_excel('data.xls')

# Read the file back, using the first column as the index
df = pd.read_excel('data.xls', index_col=[0])
print(df)

'''
                  c1        c2        c3        c4
2019-01-31 -1.469848 -0.875899 -0.571439  0.274287
2019-02-28  0.783315 -0.333277  0.091470  1.484056
2019-03-31  0.611354  0.565803 -1.298068  0.666117
2019-04-30 -0.892975 -0.144261 -2.596248  1.103916
2019-05-31 -1.207643 -0.475502 -1.577926 -0.373518
2019-06-30 -0.233937 -1.492533 -0.207368  0.163620

'''

print(df.index)
'''
DatetimeIndex(['2019-01-31', '2019-02-28', '2019-03-31', '2019-04-30',
               '2019-05-31', '2019-06-30'],
              dtype='datetime64[ns]', freq=None)
'''

print(df.columns)
'''
Index(['c1', 'c2', 'c3', 'c4'], dtype='object')
'''


print(df[['c1', 'c2']])


# 按照index取值
print(df.loc['2019-01-31'])
'''
c1   -0.511065
c2    0.173715
c3    0.460645
c4   -0.105340
Name: 2019-01-31 00:00:00, dtype: float64
'''

# Select by integer position: row 0, column 1
print(df.iloc[0,1])
'''
-0.515215674883499
'''

df.iloc[0,:] = 0
print(df)

'''
                  c1        c2        c3        c4
2019-01-31  0.000000  0.000000  0.000000  0.000000
2019-02-28 -0.473829  0.647171 -1.026075 -0.630721
2019-03-31  1.112496  2.454119 -0.339265  0.600856
2019-04-30 -0.264615 -0.035386 -0.717795  0.320868
2019-05-31 -0.638794 -0.926775  0.247402 -0.824648
2019-06-30 -0.100243 -1.077409 -1.063229 -1.314213
'''

jieba库

jieba库一般用于分词

import jieba

res = jieba.lcut('中华人民共和国是一个伟大的国家')   # 精确模式，返回一个列表类型的分词结果
print(res)

# 打印结果：
['中华人民共和国', '是', '一个', '伟大', '的', '国家']
import jieba

res = jieba.lcut_for_search('中华人民共和国是一个伟大的国家')  # 搜索引擎模式，返回一个列表类型的分词结果，存在冗余
print(res)

# 打印结果：
['中华', '华人', '人民', '共和', '共和国', '中华人民共和国', '是', '一个', '伟大', '的', '国家']
import jieba

res = jieba.lcut('中华人民共和国是一个伟大的国家',cut_all=True) # 把所有的可能全部切出来
print(res)

# 打印结果：
['中华', '中华人民', '中华人民共和国', '华人', '人民', '人民共和国', '共和', '共和国', '国是', '一个', '伟大', '的', '国家']

wordcloud词云

import wordcloud
import jieba
from imageio import imread

mk = imread('test.png')  # load the mask image into memory
s = '''当其他人盲目的追寻真相和真实的时候，记住。万物皆虚。
当其他人受到法律和道德的束缚的时候，记住。万事皆允。
我们服侍光明却耕耘于黑暗。
真正睿智的人不会向你指明真相，而是教导你去发现真相。
世界上明明有一万种宗教，人们却用一种方式祈祷。这里没有上帝，只有属于我们自己的信条。
我们在黑暗中工作，为光明服务，我们，是刺客。'''

s_list = jieba.lcut(s)  # split the text into a list of words
s = ' '.join(s_list)  # join the words back into one space-separated string
# Raw string: the backslashes of a Windows path must not be read as escape sequences.
w = wordcloud.WordCloud(font_path=r'C:\Windows\Fonts\simkai.ttf', background_color='white', mask=mk)
w.generate(s)
w.to_file('set.png')

exec函数的补充（exec是内置函数，不是模块）

exec的作用：

可以把“字符串形式”的python代码，添加到全局空间或局部空间中

使用（传入三个参数）：

参数一：“字符串形式”的python代码
参数二：全局名称空间
参数三：局部名称空间

# 全局

# Namespaces handed to exec(): the global one starts with x = 200, the local one is empty.
global_dict = dict(x=200)
local_dict = dict()

# Python source as text. `global x` sends the assignment to x into the global namespace,
# while y is created in the local namespace.
code = '''
global x
x = 10
y = 20
'''

exec(code, global_dict, local_dict)

print(global_dict)

# Both namespaces handed to exec() start out empty.
global_dict = dict()
local_dict = dict()

# Python source as text. Without a `global` statement every name it binds (x, y and func)
# is created in the local namespace.
code = '''
x = 100
y = 200

def func():
    pass

'''

exec(code, global_dict, local_dict)

print(local_dict)

subprocess模块

运行python的时候，我们都是在创建并运行一个进程。像Linux进程那样，一个进程可以fork一个子进程，并让这个子进程exec另外一个程序。在Python中，我们通过标准库中的subprocess包来fork一个子进程，并运行一个外部的程序。
subprocess包中定义有数个创建子进程的函数，这些函数分别以不同的方式创建子进程，所以我们可以根据需要来从中选取一个使用。另外subprocess还提供了一些管理标准流(standard stream)和管道(pipe)的工具，从而在进程间使用文本通信。

import subprocess

# Minimal interactive shell: read a command, run it, print what it wrote. 'q' quits.
# NOTE: shell=True hands the typed text to the system shell unchanged. That is the point
# of this demo, but never do it with input from an untrusted source.
while True:
    cmd = input('cmd>>>: ')
    if cmd == 'q':
        break

    data = subprocess.Popen(
        cmd,
        shell=True,
        stdout=subprocess.PIPE,  # capture standard output
        stderr=subprocess.PIPE  # capture standard error
    )

    # communicate() drains both pipes together and waits for the child to exit.
    # Reading stdout to EOF and only then stderr can deadlock once the child fills the stderr pipe.
    out, err = data.communicate()
    res = out + err
    print(res.decode('gbk'))  # presumably targets a Chinese-locale Windows console (GBK output); adjust elsewhere

collections模块

常用数据结构
抽象基类

from collections import *

from collections import abc

__all__ = ['deque', 'defaultdict', 'namedtuple', 'UserDict', 'UserList',
            'UserString', 'Counter', 'OrderedDict', 'ChainMap']

tuple的功能
不可变对象、iterable
可以拆包
tuple不可变不是绝对的
将可变对象放在tuple
# The tuple itself cannot be rebound, but a mutable object stored inside it can still change.
name_tuple = ("cwz", [22, 11])  # straight quotes: the typographic quotes were a syntax error
name_tuple[1].append(4)
print(name_tuple)  # ('cwz', [22, 11, 4])
tuple比list好的地方
immutable(不可变对象)的重要性
性能优化
指出元素全部为immutable的tuple会作为常量在编译时确定，因此产生了如此显著的速度差异
线程安全
可以作为dict的key
拆包特性
如果拿C语言类比，tuple对应的是struct，list对应的是array

namedtuple

创建简单对象很好用，比定义类省空间

from collections import namedtuple

# Declare the record type; the field names are given as one whitespace-separated string.
User = namedtuple("User", "name age height")
user_tuple = ("cwz", 20, 178)
user_dic = dict(name="cwz", age=20, height=178)
# Build the record by unpacking the plain tuple into the positional fields.
user = User(*user_tuple)

# _asdict() converts the namedtuple into a mapping of field name -> value.
user_info_dict = user._asdict()
print(user_info_dict)
print(*user)

defaultdict __missing__

使用dict来统计字符串出现的次数

# Count how often each name occurs using a plain dict.
users = ["cwz", "reese", "neo", "cwz", "reese", "cwz"]
user_dict = {}
for name in users:
    # get() supplies 0 for a name that has not been seen yet.
    user_dict[name] = user_dict.get(name, 0) + 1

print(user_dict)

from collections import defaultdict

# Count how often each name occurs; defaultdict(int) creates a missing key with value 0.
users = ["cwz", "reese", "neo", "cwz", "reese", "cwz"]
default_dic = defaultdict(int)
for name in users:
    default_dic[name] = default_dic[name] + 1
print(default_dic)     # defaultdict(<class 'int'>, {'cwz': 3, 'reese': 2, 'neo': 1})

自定义复杂的默认dict

from collections import defaultdict

def dict_gen():
    """Return a fresh default record with an empty name and age 0."""
    return dict(name="", age=0)


# Every missing key gets its own new record built by dict_gen.
default_dic = defaultdict(dict_gen)
print(default_dic["group"])     # {'name': '', 'age': 0}

deque 双端队列

deque是线程安全的，list不是线程安全

Counter

# 统计次数
from collections import Counter

users = ["cwz", "reese", "neo", "cwz", "reese", "cwz"]
# Start with an empty Counter and feed it the names.
user_counter = Counter()
user_counter.update(users)
print(user_counter) # Counter({'cwz': 3, 'reese': 2, 'neo': 1})

top_one = user_counter.most_common(1)  # the "top n" query, here with n = 1
print(top_one)

OrderedDict

添加顺序是有序的

from collections import OrderedDict

# Build the mapping from (key, value) pairs; the pairs keep their insertion order.
user_dict = OrderedDict([("b", "1"), ("c", "2"), ("a", "3")])

print(user_dict)
# OrderedDict([('b', '1'), ('c', '2'), ('a', '3')])

pop
popitem
move_to_end

ChainMap

访问多个dict就像访问一个dict方便