Table of Contents

Một số function hay dùng

Chờ một element có ID tương ứng xuất hiện trong DOM

# Block for up to 120 seconds until an element with id "page-loader" is
# present in the DOM (presence only; visibility is not checked).
page_loader_locator = (By.ID, "page-loader")
page_loader_present = EC.presence_of_element_located(page_loader_locator)
WebDriverWait(driver, 120).until(page_loader_present)

Kiểm tra element có text tương ứng không

# Block for up to 60 seconds until the targeted grid cell contains the
# expected text.
grid_cell_locator = (By.XPATH, '//*[@id="grid"]/div[2]/table/tbody/tr[21]/td[2]')
cell_has_text = EC.text_to_be_present_in_element(grid_cell_locator, 'TEXT_MATCH')
WebDriverWait(driver, 60).until(cell_has_text)

Chờ một overlay loading chạy xong và ẩn đi

# Block for up to 60 seconds until the element with id "spinner-loading"
# is either invisible or no longer in the DOM.
spinner_locator = (By.ID, 'spinner-loading')
spinner_gone = EC.invisibility_of_element_located(spinner_locator)
WebDriverWait(driver, 60).until(spinner_gone)

Chạy Javascript

# Example 1: read the from/to date inputs with jQuery and call the page's own
# getTranHisData() with the currently selected account.
reload_history_js = '''
    var fDate = $("#ctl00_Content_TransactionDetail_TxtFromDate").val();
    var tDate = $("#ctl00_Content_TransactionDetail_TxtToDate").val();
    getTranHisData($("#ctl00_Content_TransactionDetail_ListAccounts").val(), fDate, tDate);           
'''
driver.execute_script(reload_history_js)

# Example 2: draw the captcha <img> onto a canvas, serialise the canvas with
# toDataURL() and store that string as the innerHTML of a new
# <div id="c_detect"> appended to <body>.
export_captcha_js = '''
    var c = document.createElement('canvas');
    var img = document.getElementById('ctl00_Content_Login_Captcha_Captcha');
    c.height = img.naturalHeight;
    c.width = img.naturalWidth;
    var ctx = c.getContext('2d');
    ctx.drawImage(img, 0, 0, c.width, c.height);
    var base64String = c.toDataURL();
    var c_detect = document.createElement('div'); 
    c_detect.setAttribute("id", "c_detect");
    c_detect.innerHTML = base64String; 
    document.body.appendChild(c_detect);             
'''
driver.execute_script(export_captcha_js)

Lấy element có thể click được

# Block for up to 60 seconds until the filter button is clickable, then
# click it.
filter_button_locator = (By.XPATH, '//*[@id="ctl00_mainContainer_ctl00_btnFilterStmt"]')
button_clickable = EC.element_to_be_clickable(filter_button_locator)
element = WebDriverWait(driver, 60).until(button_clickable)
element.click()

Ví dụ

Thư viện:

browser.py
import time
from selenium import webdriver
from selenium.webdriver.chrome.options import Options
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
 
from django.conf import settings
 
class Browser:
    """Headless Chrome WebDriver wrapper configured for unattended downloads.

    Creating an instance immediately starts a Chrome session whose default
    download directory is ``download_path``.
    """

    # Directory Chrome saves downloaded files into (set per instance).
    download_path = None

    # The selenium Chrome driver, created by set_driver().
    driver = None

    def __init__(self, download_path):
        """Store the download directory and start the Chrome driver."""
        self.download_path = download_path

        self.set_driver()

    def set_driver(self):
        """Build the Chrome options and start a new headless Chrome driver.

        The chromedriver binary is read from
        ``settings.RESOURCE_PATH + '/chromedriver/chromedriver'``.
        """
        # Imported locally so this class does not rely on an extra
        # module-level import being added elsewhere.
        from selenium.webdriver.chrome.service import Service

        options = Options()
        options.add_argument('--no-sandbox')
        # A single headless switch is enough: the deprecated
        # ``options.headless`` setter and a bare 'headless' argument set the
        # same Chrome switch, and the last value given ('new') is the one
        # that takes effect.
        options.add_argument("--headless=new")
        options.add_argument('--disable-infobars')
        options.add_argument('--disable-dev-shm-usage')

        options.add_argument("--window-size=1920,1200")
        options.add_argument("--disable-notifications")
        options.add_argument('--verbose')
        options.add_experimental_option("prefs", {
            "download.default_directory": self.download_path,
            "download.prompt_for_download": False,
            "download.directory_upgrade": True,
            "safebrowsing_for_trusted_sources_enabled": False,
            "safebrowsing.enabled": False,
        })
        options.add_argument('--disable-gpu')
        options.add_argument('--disable-software-rasterizer')

        options.add_argument("start-maximized")  # open browser maximized
        options.add_argument("--disable-extensions")

        chrome_driver_path = settings.RESOURCE_PATH + '/chromedriver/chromedriver'

        # ``executable_path=`` on webdriver.Chrome is deprecated in Selenium 4
        # and removed in later 4.x releases; the driver binary is passed
        # through a Service object instead.
        self.driver = webdriver.Chrome(
            service=Service(executable_path=chrome_driver_path),
            options=options,
        )

    def get_driver(self):
        """Return the Chrome driver created by set_driver()."""
        return self.driver

    def destroy_driver(self):
        """Quit the Chrome driver."""
        self.driver.quit()

    @staticmethod
    def click_by_xpath(driver, xpath):
        """Wait up to 60 seconds for ``xpath`` to be clickable, then click it."""
        element = WebDriverWait(driver, 60).until(
            EC.element_to_be_clickable((By.XPATH, xpath))
        )

        element.click()

    @staticmethod
    def download_excel(self, timeout=25, download_func=None, poll_interval=1):
        """Poll ``download_func`` until it reports that a file was downloaded.

        NOTE: this is a ``staticmethod`` that nevertheless declares a leading
        ``self`` parameter. The parameter is never read; it is kept only so
        existing positional callers keep working, and any placeholder value
        may be passed for it.

        Args:
            self: Unused placeholder (see note above).
            timeout: Maximum number of polls. With the default
                ``poll_interval`` of one second this is roughly a timeout in
                seconds.
            download_func: Zero-argument callable returning a truthy value
                once the download exists. When ``None`` the method only
                sleeps until the polls are exhausted.
            poll_interval: Seconds to sleep before each poll.

        Returns:
            The last value returned by ``download_func`` (truthy on success),
            or ``False`` if it was never called.
        """
        has_file_download = False
        polls = 0
        # ``not`` (rather than ``== False``) keeps polling when the probe
        # returns None or another falsy value instead of stopping early.
        while not has_file_download and polls < timeout:
            time.sleep(poll_interval)

            if download_func is not None:
                has_file_download = download_func()

            polls += 1
        return has_file_download

Ví dụ:

mb.py
import logging
import datetime
import sys
import time
import hashlib
import traceback
 
from bank.services.captcha import Captcha
from selenium.common.exceptions import NoSuchElementException, TimeoutException
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.common.by import By
from django.conf import settings
from django.core.management.base import BaseCommand
from bank.services.browser import Browser
from bank.services.helper import Helper
 
logger = logging.getLogger('django')
class Command(BaseCommand):
    """Django management command that crawls MBBank internet banking.

    It logs in through a headless Chrome session (solving the captcha with
    the ``Captcha`` service), then opens the transaction inquiry page for each
    configured account and scrapes the transaction table.
    """

    help = 'Crawl MBBank IB'
    bank_no = 'MBBANK'

    # Masked account numbers handled by this crawler.
    account_no_1 = '2341***9999'
    account_no_2 = '0680***6666'
    account_no_3 = '0928***8888'
    account_no_4 = '9501***9999'
    account_no_5 = '9065***1617'

    def __init__(self, *args, **kwargs):
        """Initialise the base command and set the download directory."""
        # BaseCommand.__init__ sets up stdout/stderr/style; it must run and
        # must receive whatever arguments Django passes to the constructor.
        super().__init__(*args, **kwargs)
        self.download_path = settings.DATA_PATH + "/mb"

    @staticmethod
    def _parse_amount(text):
        """Convert a comma-grouped amount cell to ``int``; empty text is 0."""
        digits = text.replace(',', '').strip()
        return int(digits) if digits else 0

    def _parse_transaction_row(self, row, account_no):
        """Build the transaction dict for one ``<tr>`` of the history table."""
        reference_id = row.find_element(By.XPATH, './/td[3]').text
        trans_time_o = row.find_element(By.XPATH, './/td[9]').text
        trans_time = datetime.datetime.strptime(
            trans_time_o, '%d/%m/%Y %H:%M:%S').strftime("%Y-%m-%d %H:%M")

        amount_out = self._parse_amount(row.find_element(By.XPATH, './/td[4]').text)
        amount_in = self._parse_amount(row.find_element(By.XPATH, './/td[5]').text)
        # The incoming amount wins when both columns are non-zero.
        amount = amount_in or amount_out

        balance = row.find_element(By.XPATH, './/td[6]').text
        balance = balance.replace(' ', '').replace('VND', '').replace(',', '')
        content = row.find_element(By.XPATH, './/td[8]').text.strip()

        # The balance is only kept for account_no_4; it is zeroed for every
        # other account.
        if account_no != self.account_no_4:
            balance = 0

        # The checksum uses the raw (unformatted) transaction time.
        checksum = hashlib.md5(
            (str(reference_id) + self.bank_no + account_no + str(amount)
             + str(balance) + content + trans_time_o).encode('utf-8')
        ).hexdigest()

        return {
            'checksum': checksum,
            'bank_no': self.bank_no,
            'reference_id': reference_id,
            'account_no': account_no,
            'content': content,
            'amount': amount,
            'balance': balance,
            'member_bank_no': None,
            'member_account_no': None,
            'member_account_name': None,
            'trans_time': trans_time,
        }

    def get_transactions(self, driver, account_no):
        """Scrape the transaction history table currently shown by ``driver``.

        Args:
            driver: Selenium driver already showing the query results.
            account_no: Account number the results belong to.

        Returns:
            A list of transaction dicts, one per table row scraped.
        """
        transactions = []

        current_page = 1
        page = 1
        while page < 100000000:
            # Give the table time to render before reading it.
            time.sleep(5)

            pager = self.check_exists_by_xpath(driver, '//*[@id="page-items"]')
            if pager:
                current_page = int(pager.find_element(By.CLASS_NAME, 'active').text)
                print('page' + str(current_page))

            # Stop once the pager no longer reports the page we expect next.
            if page > current_page:
                break

            page += 1

            rows = driver.find_elements(
                By.XPATH, '//*[@id="tbl-transaction-history"]/tbody/tr')
            for row in rows:
                transaction = self._parse_transaction_row(row, account_no)
                print(transaction)
                transactions.append(transaction)

            # NOTE(review): nothing here moves the pager to the next page, so
            # the loop ends after the first page. The pager buttons live under
            # '//*[@id="page-items"]/button' - confirm the intended navigation.

        return transactions

    def check_exists_by_xpath(self, driver, xpath):
        """Return the element matching ``xpath``, or ``False`` if none exists."""
        try:
            return driver.find_element(By.XPATH, xpath)
        except NoSuchElementException:
            return False

    def getAccountSelect(self, account_no):
        """Return the XPath of the dropdown option for ``account_no``.

        Returns an empty string when the account is not one of the
        configured accounts.
        """
        # Position of each account inside the account dropdown overlay.
        option_positions = {
            self.account_no_1: 8,
            self.account_no_2: 5,
            self.account_no_3: 7,
            self.account_no_4: 1,
            self.account_no_5: 9,
        }
        position = option_positions.get(account_no)
        if position is None:
            return ""
        return ('//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")]['
                + str(position) + ']')

    def get_account(self, driver, account_no):
        """Open the transaction inquiry for ``account_no`` and scrape it.

        Any failure is printed and logged with its traceback; it is not
        re-raised, so the remaining accounts can still be processed.
        """
        try:
            logging.getLogger('mb').info("Get account "+account_no)

            # Fail immediately for an unknown account instead of waiting 60
            # seconds on an empty XPath later on.
            account_option_xpath = self.getAccountSelect(account_no)
            if not account_option_xpath:
                raise ValueError('No dropdown option configured for account ' + account_no)

            transaction_url = "https://ebank.mbbank.com.vn/cp/account-info/transaction-inquiry?acctNo=" + account_no

            driver.get(transaction_url)

            # Wait until the first input has been populated with a value.
            WebDriverWait(driver, 10).until(
                lambda d: len(d.find_element(By.XPATH, '//*[@id="mat-input-0"]').get_attribute("value")) > 0)

            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.ID, "btn-query"))
            )

            time.sleep(5)

            # Open the account dropdown and pick this account's option.
            self.click_by_xpath(driver, '//*[@name="account-name"]')
            self.click_by_xpath(driver, account_option_xpath)

            driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-'+account_no+'-3.png')

            # Zero-based index of today's day-of-month among the calendar cells.
            date_index = datetime.datetime.now().day - 1

            self.click_by_xpath(driver, '//*[@id="mat-radio-3"]/label/div[1]')

            time.sleep(5)

            # Open the first date picker, click today's cell, then click the
            # fourth button inside the first 'mat-card-custom' element.
            driver.execute_script("document.getElementsByClassName('mat-datepicker-toggle')[0].click()")
            driver.execute_script("document.getElementsByClassName('mat-calendar-body-cell-content')["+str(date_index)+"].click();")
            driver.execute_script("document.getElementsByClassName('mat-card-custom')[0].getElementsByTagName('button')[3].click();")

            driver.find_element(By.XPATH, '//*[@id="btn-query"]').click()

            WebDriverWait(driver, 60).until(
                EC.presence_of_element_located((By.ID, "tbl-transaction-history"))
            )

            self.get_transactions(driver, account_no)
        except Exception as e:
            print('get_account_1: ' + str(e))

            logger.error(traceback.format_exc())

    def login(self, driver):
        """Fill in and submit the login form shown by ``driver``.

        Returns:
            The same ``driver`` after the URL has changed away from the
            login page.

        Raises:
            RuntimeError: If the captcha service returns an empty answer.
        """
        current_url = driver.current_url

        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb.png')

        WebDriverWait(driver, 30).until(
            EC.presence_of_element_located((By.CLASS_NAME, "image-captcha-frame"))
        )

        # The captcha image source is handed to the captcha service as-is.
        captcha_base64 = driver.find_element(
            By.CSS_SELECTOR, '.image-captcha-frame > img').get_attribute('src')

        captcha_txt = Captcha().request(captcha_base64)

        print(captcha_txt)

        if captcha_txt == "":
            # Abort instead of quitting the driver here and then continuing
            # to use a dead session; the caller is responsible for quitting.
            raise RuntimeError('Captcha service returned an empty result')

        driver.find_element(By.XPATH, '//*[@id="corp-id"]').send_keys(settings.BANK['mb']['company_no'])
        driver.find_element(By.XPATH, '//*[@id="user-id"]').send_keys(settings.BANK['mb']['username'])
        driver.find_element(By.XPATH, '//*[@id="password"]').send_keys(settings.BANK['mb']['password'])
        driver.find_element(By.XPATH, '//*[@id="main-content"]/mbb-welcome/div/div/div[2]/div[2]/div/mbb-login/form/div/div[2]/mbb-word-captcha/div/div[2]/div/input').send_keys(captcha_txt)

        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login.png')

        driver.find_element(By.XPATH, '//*[@id="login-btn"]').click()

        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login1.png')

        WebDriverWait(driver, 30).until(EC.url_changes(current_url))

        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login2.png')

        return driver

    def crawl(self):
        """Start a browser, log in and crawl every configured account.

        The browser is always shut down, even when opening the login page
        fails.
        """
        browser = Browser(self.download_path)

        driver = browser.get_driver()

        try:
            driver.get("https://ebank.mbbank.com.vn/cp/pl/login")

            try:
                self.login(driver)

                # account_no_4 is crawled first, then the remaining accounts.
                for account_no in (
                    self.account_no_4,
                    self.account_no_1,
                    self.account_no_2,
                    self.account_no_3,
                    self.account_no_5,
                ):
                    self.get_account(driver, account_no)
            except Exception as e:
                print('MB Failed Exception: ' + str(e))

                logger.error(traceback.format_exc())
        finally:
            self.quit(driver)

    def click_by_xpath(self, driver, xpath):
        """Wait up to 60 seconds for ``xpath`` to be clickable, then click it."""
        # Same behaviour as the shared Browser helper, so reuse it.
        Browser.click_by_xpath(driver, xpath)

    def handle(self, *args, **kwargs):
        """Entry point: crawl unless another crawler process is running."""
        if Helper.count_process_exists('mb_v2') > 1:
            logging.getLogger('mb').info("Exit by process crawler runing...")

            print("Exit by process crawler runing...")

            sys.exit()

        self.crawl()

    def quit(self, driver):
        """Close the current window and shut the driver down."""
        time.sleep(2)
        try:
            driver.close()
        finally:
            # Always end the session, even if closing the window failed.
            driver.quit()