notebook | 个人学习笔记

进程与线程

进程(process): 进程是操作系统进行资源分配的基本单位线程(thread): 线程是进程中的一个执行单元

可以将操作系统看做一个学校, 学校里面有很多班级(多进程), 多个班级互相独立, 每个班级中有多个学生(多线程) 多个线程之间可以共享进程的资源

多进程

multiprocessing.Process
各个进程相互独立(资源不共享)

from multiprocessing import Process
from time import sleep, time
from os import getpid, getppid

def sing(times):
    # 获取当前进程 pid
    print(f"sing pid is {getpid()}")

    # 获取当前的父进程 pid
    print(f"sing parent process pid is {getppid()}")
    for i in range(times):
        # 模拟运行时间长的代码操作
        sleep(1)
        print(f"sing-{i}")

def dance(times):
    print(f"dance pid is {getpid()}")
    print(f"dance parent process pid is {getppid()}")
    for i in range(times):
        sleep(1)
        print(f"dance-{i}")

# 只用主进程执行
def run_single_process():
    sing(3)
    dance(4)

# 多进程执行
def run_multi_process():
    # 创建子进程(使用元组传参)
    sing_process = Process(target=sing, args=(3,))

    # 使用字典传参
    dance_process = Process(target=dance, kwargs={"times": 4})

    # 子进程开始执行
    sing_process.start()
    dance_process.start()

    # 确保子进程执行完
    sing_process.join()
    dance_process.join()

if __name__ == "__main__":
    print(f"---main-start---, pid is {getpid()}")
    start_time = time()

    # run_single_process()  # 7s
    run_multi_process()  # 4s

    end_time = time()
    used_time = int(end_time - start_time) # seconds

    print(f"---main-end---, used time {used_time}")

多线程

threading.Thread
各个线程在同一个进程中, 资源是共享的

有了多进程(同时处理多个任务), 为什么还需要多线程

因为: 进程是操作系统进行资源分配的基本单位, 每开一个线程就会占用一定的资源且无法共享, 但是如果是多线程, 同属一个进程的多个线程可以共享资源, 开销更小

如果要使用多进程, 相当于学校需要增加一个班级而使用多线程, 只是在一个班级中增加了一个学生

from threading import Thread
from time import sleep, time
from os import getpid


def sing(times):
    # 获取当前进程 pid, 此时 pid 和主线程是
    # 一样的, 证明在同一个进程中
    print(f"sing pid is {getpid()}")

    for i in range(times):
        # 模拟运行时间长的代码操作
        sleep(1)
        print(f"sing-{i}")


def dance(times):
    print(f"dance pid is {getpid()}")
    for i in range(times):
        sleep(1)
        print(f"dance-{i}")


# 单线程执行
def run_single_process():
    sing(3)
    dance(4)


# 多线程执行
def run_multi_process():
    # 创建子线程(使用元组传参)
    sing_thread = Thread(target=sing, args=(3,))

    # 使用字典传参
    dance_thread = Thread(target=dance, kwargs={"times": 4})

    # 子线程开始执行
    sing_thread.start()
    dance_thread.start()

    # 确保子线程执行完
    sing_thread.join()
    dance_thread.join()


if __name__ == "__main__":
    print(f"---main-start---, pid is {getpid()}")
    start_time = time()

    # run_single_process()  # 7s
    run_multi_process()  # 4s

    end_time = time()
    used_time = int(end_time - start_time)

    print(f"---main-end---, used time {used_time}")

多线程练习

# 多线程练习(爬虫)
import requests
import json
from bs4 import BeautifulSoup
from threading import Thread


def download(url):
    resp = requests.get(url)
    return resp.text


def parse(html):
    soup = BeautifulSoup(html, "html.parser")
    links = soup.find_all("a", class_="post-item-title")
    # return [(link.attrs["href"], link.get_text()) for link in links]
    items = []
    for link in links:
        items.append(
            {
                "url": link.attrs["href"],
                "title": link.get_text(),
            }
        )
    return items


def save_json(file_path, items):
    with open(file_path, "w", encoding="utf-8") as file:
        file.write(json.dumps(items, ensure_ascii=False))


urls = [
    # urls
    f"https://www.cnblogs.com/#p{page}"
    for page in range(1, 10)
]


def run():
    i = 0
    for url in urls:
        i += 1
        file_path = f"./run-{i}.json"
        for page_items in parse(download(url)):
            print(page_items)
            save_json(file_path, page_items)
        print(f"{file_path} wirte finish")
    print("--- finish ---")


def run_with_mp():
    def task(i, url):
        file_path = f"./run-with-mp-{i}.json"
        for page_items in parse(download(url)):
            print(page_items)
            save_json(file_path, page_items)
        print(f"{file_path} wirte finish")

    i = 0
    for url in urls:
        i += 1
        t = Thread(target=task, args=(i, url))
        t.start()
        t.join()
    print("--- finish ---")


# 两个函数对比, 可以明显的感觉出来 run_with_mp 速度更快
# run()
run_with_mp()

竞争态与线程锁

数据竞争线程锁

from threading import Thread, current_thread


class Account:
    def __init__(self, balance):
        self.balance = balance


def draw(account, amount):
    if account.balance >= amount:
        print(f"{current_thread().name}-取钱成功")

        ### maybe switch thread ###
        account.balance -= amount
        print(f"{current_thread().name}-余额:{account.balance}")
    else:
        print(f"{current_thread().name}-取钱失败")
        print(f"{current_thread().name}-余额:{account.balance}")


account = Account(100)
amount  = 80
t1 = Thread(name="t1", target=draw, args=(account, amount))
t2 = Thread(name="t2", target=draw, args=(account, amount))

t1.start()
t2.start()

t1.join()
t2.join()

# 由代码输出结果可以知道:
# account.balance 有可能为 -60 这个肯定是不对的
# 造成这个结果的原因可能是, t1 进入 if 之后(此时
# 还没有执行 account.balance -= amount) 线程切换了,
# 那么 t2 在执行的时候的时候还是会进入 if 判断
# 此时数据就是不对了

print("---finish---")

from threading import Thread, Lock, current_thread


class Account:
    def __init__(self, balance):
        self.balance = balance


lock = Lock() # 线程锁
def draw(account, amount):
    # must be wait the thread lock released before access data #
    with lock:
        if account.balance >= amount:
            print(f"{current_thread().name}-取钱成功")
            account.balance -= amount
            print(f"{current_thread().name}-余额:{account.balance}")
        else:
            print(f"{current_thread().name}-取钱失败")
            print(f"{current_thread().name}-余额:{account.balance}")


account = Account(100)
amount  = 80
t1 = Thread(name="t1", target=draw, args=(account, amount))
t2 = Thread(name="t2", target=draw, args=(account, amount))

t1.start()
t2.start()

t1.join()
t2.join()

线程池

threading.ThreadPoolExecuto

使用线程池的好处

使用线程, 每次都需要新建线程销毁线程, 这个过程比较耗费资源且如果新建线程过多会导致系统卡顿, 为了解决这些问题, 就有了线程池, 线程池的原理是:

预先创建好一些线程(用完不会销毁)
数量是确定的(防止新建线程过多)

线程池线程池+线程锁

from concurrent.futures import ThreadPoolExecutor
from time import sleep, time


def read_file():
    sleep(1)
    return "read file finish"


def connect_db():
    sleep(2)
    return "connect db finish"


def send_request():
    sleep(3)
    return "send request finish"


def run_sync():
    print("--start--")
    start_time = time()

    f1 = read_file()
    print("f1 result is: ", f1)

    f2 = connect_db()
    print("f2 result is: ", f2)

    f3 = send_request()
    print("f3 result is: ", f3)

    end_time = time()
    used_time = int(end_time - start_time)
    print(f"used time: {used_time}")
    print("--end--")


pool = ThreadPoolExecutor(max_workers = 5)

def run_mp():
    print("--start--")
    start_time = time()

    f1 = pool.submit(read_file)
    f2 = pool.submit(connect_db)
    f3 = pool.submit(send_request)

    # .result 方法会阻塞当前线程
    print("f1 result is: ", f1.result())
    print("f2 result is: ", f2.result())
    print("f3 result is: ", f3.result())

    end_time = time()
    used_time = int(end_time - start_time)
    print(f"used time: {used_time}")
    print("--end--")


# run_sync()
run_mp()

from concurrent.futures import ThreadPoolExecutor
from time import sleep
from threading import Lock, current_thread


pool = ThreadPoolExecutor(max_workers=5)
lock = Lock() 


class Account:
    def __init__(self, balance):
        self.balance = balance


def draw(account, amount):
    # must be wait the thread lock released before access data #
    with lock:
        if account.balance >= amount:
            sleep(0.1)  # switch to other threads by manually
            print(f"{current_thread().name}-取钱成功")
            account.balance -= amount
            print(f"{current_thread().name}-余额:{account.balance}")
        else:
            print(f"{current_thread().name}-取钱失败")
            print(f"{current_thread().name}-余额:{account.balance}")
    return account.balance


account = Account(100)
amount = 80

f1 = pool.submit(draw, account, amount)
f2 = pool.submit(draw, account, amount)

多协程(Coroutine)

协程(纤程)也叫微线程, 它是由编码者去控制的, 并不是由操作系统去调度的, 这也是与线程/进程最明显的区别

线程: 适用于需要利用多核处理器进行并行计算的场景, 或者处理多个独立的, 计算密集型的任务, 例如: 科学计算、图形处理等领域
协程: 适用于高并发的 I/O 密集型场景, 比如文件读取, 在这些场景中, 协程可以在等待 I/O 操作完成时暂停执行, 避免浪费 CPU 时间, 从而提高程序的响应性和吞吐量

进程与线程 ​

多进程 ​

多线程 ​

多线程练习 ​

竞争态与线程锁 ​

线程池 ​

多协程(Coroutine) ​

进程与线程

多进程

多线程

多线程练习

竞争态与线程锁

线程池

多协程(Coroutine)