TungNT (Blue)

tungnt.blue@gmail.com

User Tools

Site Tools


development:python:selemium

Differences

This shows you the differences between two versions of the page.

Link to this comparison view

Next revision
Previous revision
development:python:selemium [2024/08/07 02:28] – created tungnt | development:python:selemium [2024/08/15 15:40] (current) tungnt
Line 1: Line 1:
-====== Selemium ======
 +====== Một số function hay dùng ======
  
 +===== Chờ 1 ID load xong =====
 +
 +
 +<file python>
 +WebDriverWait(driver, 120).until(
 +    EC.presence_of_element_located((By.ID, "page-loader"))
 +)
 +</file>
 +
 +===== Kiểm tra element có text tương ứng không =====
 +
 +<file python>
 +
 +WebDriverWait(driver, 60).until(
 +    EC.text_to_be_present_in_element((By.XPATH, '//*[@id="grid"]/div[2]/table/tbody/tr[21]/td[2]'), 'TEXT_MATCH'))  
 +</file>  
 +
 +===== Chờ một overlay load xong và ẩn đi =====
 +
 +<file python>
 +WebDriverWait(driver, 60).until(
 +    EC.invisibility_of_element_located((By.ID, 'spinner-loading'))
 +)
 +</file>            
 +
 +===== Chạy Javascript =====
 +
 +<file python>
 +# Run JavaScript inside the page: read the from/to dates and the selected
 +# account from the form fields and hand them to the page's getTranHisData().
 +# NOTE(review): assumes the page itself defines `$` (presumably jQuery) and
 +# getTranHisData - neither is provided by Selenium.
 +driver.execute_script('''
 +    var fDate = $("#ctl00_Content_TransactionDetail_TxtFromDate").val();
 +    var tDate = $("#ctl00_Content_TransactionDetail_TxtToDate").val();
 +    getTranHisData($("#ctl00_Content_TransactionDetail_ListAccounts").val(), fDate, tDate);           
 +''')  
 +</file>  
 +
 +<file python>
 +# Run JavaScript inside the page: draw the captcha <img> onto a canvas at its
 +# natural size, serialise the canvas with toDataURL(), and store the resulting
 +# string as the innerHTML of a new <div id="c_detect"> appended to <body>.
 +# NOTE(review): presumably the div is read back afterwards (e.g. via
 +# find_element(By.ID, 'c_detect')) to obtain the image data - that step is not
 +# shown here.
 +driver.execute_script('''
 +    var c = document.createElement('canvas');
 +    var img = document.getElementById('ctl00_Content_Login_Captcha_Captcha');
 +    c.height = img.naturalHeight;
 +    c.width = img.naturalWidth;
 +    var ctx = c.getContext('2d');
 +    ctx.drawImage(img, 0, 0, c.width, c.height);
 +    var base64String = c.toDataURL();
 +    var c_detect = document.createElement('div'); 
 +    c_detect.setAttribute("id", "c_detect");
 +    c_detect.innerHTML = base64String; 
 +    document.body.appendChild(c_detect);             
 +''')
 +</file>
 +
 +===== Lấy element có thể click được =====
 +
 +<file python>
 +element = WebDriverWait(driver, 60).until(
 +    EC.element_to_be_clickable((By.XPATH, '//*[@id="ctl00_mainContainer_ctl00_btnFilterStmt"]'))
 +)
 +
 +element.click()
 +</file>
 +
 +====== Ví dụ ======
 +
 +**Thư viện:**
 +
 +<file python browser.py>
 +import time
 +from selenium import webdriver
 +from selenium.webdriver.chrome.options import Options
 +from selenium.common.exceptions import NoSuchElementException, TimeoutException
 +from selenium.webdriver.support.ui import WebDriverWait
 +from selenium.webdriver.support import expected_conditions as EC
 +from selenium.webdriver.common.by import By
 +
 +from django.conf import settings
 +
 +class Browser:
 +    download_path = None
 +
 +    driver = None
 +
 +    def __init__(self, download_path):
 +        self.download_path = download_path
 +
 +        self.set_driver()
 +
 +    def set_driver(self):
 +        options = Options()
 +        options.headless = True
 +        options.add_argument('--no-sandbox')
 +        options.add_argument('headless')
 +        options.add_argument("--headless=new")
 +        options.add_argument('--disable-infobars')
 +        options.add_argument('--disable-dev-shm-usage')
 +
 +        #if settings.APP_ENV == 'production':
 +        #   options.add_argument('--remote-debugging-port=9222')
 +
 +        options.add_argument("--window-size=1920,1200")
 +        options.add_argument("--disable-notifications")
 +        options.add_argument('--verbose')
 +        options.add_experimental_option("prefs", {
 +            "download.default_directory": self.download_path,
 +            "download.prompt_for_download": False,
 +            "download.directory_upgrade": True,
 +            "safebrowsing_for_trusted_sources_enabled": False,
 +            "safebrowsing.enabled": False
 +        })
 +        options.add_argument('--disable-gpu')
 +        options.add_argument('--disable-software-rasterizer')
 +        #options.add_argument("user-data-dir=chrome-data")
 +
 +        options.add_argument("start-maximized") # open Browser in maximized mode
 +        options.add_argument("--disable-extensions") # disabling extensions
 +
 +        chrome_driver_path = settings.RESOURCE_PATH + '/chromedriver/chromedriver'
 +
 +        #self.driver = webdriver.Chrome(options=options)
 +        self.driver = webdriver.Chrome(options=options, executable_path=chrome_driver_path)
 +
 +    def get_driver(self):
 +        return self.driver
 +
 +    def destroy_driver(self):
 +        self.driver.quit()
 +
 +    @staticmethod
 +    def click_by_xpath(driver, xpath):
 +        element = WebDriverWait(driver, 60).until(
 +            EC.element_to_be_clickable((By.XPATH, xpath))
 +        )
 +
 +        element.click()
 +
 +    @staticmethod
 +    def download_excel(self, timeout=25, download_func=None):
 +        seconds = 0
 +        has_file_download = False
 +        while has_file_download == False and seconds < timeout:
 +            time.sleep(1)
 +
 +            if download_func != None:
 +                has_file_download = download_func()
 +
 +            seconds += 1
 +        return has_file_download              
 +</file>  
 +
 +**Ví dụ:**
 +
 +<file python mb.py>
 +import logging
 +import datetime
 +import sys
 +import time
 +import hashlib
 +import traceback
 +
 +from bank.services.captcha import Captcha
 +from selenium.common.exceptions import NoSuchElementException, TimeoutException
 +from selenium.webdriver.support.ui import WebDriverWait
 +from selenium.webdriver.support import expected_conditions as EC
 +from selenium.webdriver.common.by import By
 +from django.conf import settings
 +from django.core.management.base import BaseCommand
 +from bank.services.browser import Browser
 +from bank.services.helper import Helper
 +
 +logger = logging.getLogger('django')
 +class Command(BaseCommand):
 +    help = 'Crawl MBBank IB'
 +    bank_no = 'MBBANK'
 +
 +    account_no_1 = '2341***9999'
 +    account_no_2 = '0680***6666'
 +    account_no_3 = '0928***8888'
 +    account_no_4 = '9501***9999'
 +    account_no_5 = '9065***1617'
 +
 +    def __init__(self):
 +        self.download_path = settings.DATA_PATH + "/mb"
 +
 +    def get_transactions(self, driver, account_no):
 +        transactions = []
 +
 +        current_page = 1
 +        page = 1
 +        while page < 100000000:
 +            time.sleep(5)
 +
 +            element = self.check_exists_by_xpath(driver,  '//*[@id="page-items"]')
 +            if element:
 +                current_page = int(element.find_element(By.CLASS_NAME, 'active').text)
 +                print('page' + str(current_page))
 +
 +            if page > current_page:
 +                break
 +
 +            page += 1
 +
 +            trans_elements = driver.find_elements(By.XPATH, '//*[@id="tbl-transaction-history"]/tbody/tr')
 +
 +            # logger.error(trans_elements)  
 +
 +            for element in trans_elements:
 +                bank_no = self.bank_no
 +                reference_id = element.find_element(By.XPATH, './/td[3]').text
 +                trans_time_o = element.find_element(By.XPATH, './/td[9]').text
 +                trans_time = trans_time_o
 +
 +                trans_time = datetime.datetime.strptime(trans_time_o, '%d/%m/%Y %H:%M:%S').strftime("%Y-%m-%d %H:%M")
 +
 +                amoutnOut = element.find_element(By.XPATH, './/td[4]').text
 +                amoutnIn = element.find_element(By.XPATH, './/td[5]').text
 +
 +                amoutnOut = int(amoutnOut.replace(',', ''))
 +                amoutnIn = int(amoutnIn.replace(',', ''))
 +
 +                amount = 0
 +                if amoutnOut :
 +                    amount = amoutnOut
 +
 +                if amoutnIn:
 +                    amount = amoutnIn
 +
 +                balance = element.find_element(By.XPATH, './/td[6]').text
 +                balance = balance.replace(' ', '').replace('VND', '').replace(',', '')
 +                content = element.find_element(By.XPATH, './/td[8]').text
 +                content = content.strip()
 +
 +                if account_no != self.account_no_4:
 +                    balance = 0
 +                
 +                checksum = hashlib.md5(
 +                    (str(reference_id) + self.bank_no + account_no + str(amount) + str(balance) + content + trans_time_o).encode(
 +                        'utf-8')).hexdigest()
 +
 +                transaction = {
 +                    'checksum': checksum,
 +                    'bank_no': bank_no,
 +                    'reference_id': reference_id,
 +                    'account_no': account_no,
 +                    'content': content,
 +                    'amount': amount,
 +                    'balance': balance,
 +                    'member_bank_no': None,
 +                    'member_account_no': None,
 +                    'member_account_name': None,
 +                    'trans_time': trans_time,
 +                }
 +
 +                print(transaction)
 +
 +            size = len(driver.find_elements(By.XPATH, '//*[@id="page-items"]/button'))
 +
 +            size = size - 1
 +
 +        return transactions
 +
 +    def check_exists_by_xpath(self, driver, xpath):
 +        try:
 +            ele = driver.find_element(By.XPATH, xpath)
 +        except NoSuchElementException:
 +            return False
 +        return ele
 +
 +    def getAccountSelect(self, account_no):
 +        if account_no == self.account_no_1:
 +            return '//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")][8]'
 +        elif account_no == self.account_no_2:
 +            return '//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")][5]'
 +        elif account_no == self.account_no_3:
 +            return '//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")][7]'
 +        elif account_no == self.account_no_4:
 +            return '//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")][1]'
 +        elif account_no == self.account_no_5:
 +            return '//*[@class="cdk-overlay-pane"][1]/div/*[contains(@class,"mat-option")][9]'                                    
 +        else:
 +            return ""
 +
 +    def get_account(self, driver, account_no):
 +        try:
 +            logging.getLogger('mb').info("Get account "+account_no)
 +
 +            transaction_url = "https://ebank.mbbank.com.vn/cp/account-info/transaction-inquiry?acctNo=" + account_no
 +
 +            driver.get(transaction_url)
 +
 +            WebDriverWait(driver, 10).until(
 +                lambda driver: len(driver.find_element(By.XPATH, '//*[@id="mat-input-0"]').get_attribute("value")) > 0)
 +
 +            WebDriverWait(driver, 60).until(
 +                EC.presence_of_element_located((By.ID, "btn-query"))
 +            )           
 +
 +            time.sleep(5) 
 +
 +            # driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-'+account_no+'-1.png')
 +        
 +            self.click_by_xpath(driver, '//*[@name="account-name"]')
 +
 +            # driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-'+account_no+'-2.png')
 +
 +            self.click_by_xpath(driver, self.getAccountSelect(account_no))
 +
 +            driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-'+account_no+'-3.png')
 +
 +            tod = datetime.datetime.now()
 +            to_date = tod.strftime("%d")
 +            date_index = int(to_date) -1
 +
 +            self.click_by_xpath(driver, '//*[@id="mat-radio-3"]/label/div[1]')
 +
 +            time.sleep(5)
 +            
 +            driver.execute_script("document.getElementsByClassName('mat-datepicker-toggle')[0].click()")
 +            driver.execute_script("document.getElementsByClassName('mat-calendar-body-cell-content')["+str(date_index)+"].click();")
 +            driver.execute_script("document.getElementsByClassName('mat-card-custom')[0].getElementsByTagName('button')[3].click();")
 +
 +
 +            driver.find_element(By.XPATH, '//*[@id="btn-query"]').click()
 +
 +            WebDriverWait(driver, 60).until(
 +                EC.presence_of_element_located((By.ID, "tbl-transaction-history"))
 +            )
 +
 +            self.get_transactions(driver, account_no)           
 +        except Exception as e:
 +            print('get_account_1: ' + str(e)) 
 +
 +            logger.error(traceback.format_exc())  
 +
 +    def login(self, driver):
 +        current_url = driver.current_url
 +
 +        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb.png')
 +
 +        WebDriverWait(driver, 30).until(
 +            EC.presence_of_element_located((By.CLASS_NAME, "image-captcha-frame"))
 +        )
 +
 +        #captcha_base64 = driver.find_element_by_css_selector('.image-captcha-frame > img').get_attribute('src')
 +        captcha_base64 = driver.find_element(By.CSS_SELECTOR, '.image-captcha-frame > img').get_attribute('src')
 +
 +        captcha = Captcha()
 +        captcha_txt = captcha.request(captcha_base64)
 +
 +        print(captcha_txt)
 +
 +        if captcha_txt == "":
 +            driver.quit()        
 +
 +        driver.find_element(By.XPATH, '//*[@id="corp-id"]').send_keys(settings.BANK['mb']['company_no'])
 +        driver.find_element(By.XPATH, '//*[@id="user-id"]').send_keys(settings.BANK['mb']['username'])
 +        driver.find_element(By.XPATH, '//*[@id="password"]').send_keys(settings.BANK['mb']['password'])
 +        driver.find_element(By.XPATH, '//*[@id="main-content"]/mbb-welcome/div/div/div[2]/div[2]/div/mbb-login/form/div/div[2]/mbb-word-captcha/div/div[2]/div/input').send_keys(captcha_txt)
 +        
 +        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login.png')
 +        
 +        driver.find_element(By.XPATH, '//*[@id="login-btn"]').click()
 +
 +        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login1.png')
 +
 +        WebDriverWait(driver, 30).until(EC.url_changes(current_url))
 +
 +        driver.save_screenshot(settings.LOGS_PATH + '/screenshot-mb-login2.png')
 +
 +        return driver
 +
 +    def crawl(self):
 +        browser = Browser(self.download_path)
 +
 +        driver = browser.get_driver()
 +
 +        driver.get("https://ebank.mbbank.com.vn/cp/pl/login")
 +
 +        current_url = driver.current_url
 +
 +        try:
 +            self.login(driver)
 +
 +            self.get_account(driver, self.account_no_4)
 +            
 +            self.get_account(driver, self.account_no_1)
 +            self.get_account(driver, self.account_no_2)
 +            self.get_account(driver, self.account_no_3)
 +            
 +            self.get_account(driver, self.account_no_5)
 +
 +        except Exception as e:
 +            print('MB Failed Exception: ' + str(e))
 +
 +            logger.error(traceback.format_exc())
 +
 +        self.quit(driver)
 +
 +    def click_by_xpath(self, driver, xpath):
 +        element = WebDriverWait(driver, 60).until(
 +            EC.element_to_be_clickable((By.XPATH, xpath))
 +        )
 +
 +        element.click()
 +
 +    def handle(self, *args, **kwargs):
 +        if Helper.count_process_exists('mb_v2') > 1:
 +            logging.getLogger('mb').info("Exit by process crawler runing...")
 +
 +            print("Exit by process crawler runing...")
 +
 +            sys.exit()
 +
 +        self.crawl()
 +
 +    def quit(self, driver):
 +        time.sleep(2)
 +        driver.close()
 +        driver.quit()     
 +</file>     
 +
 +
 +        
development/python/selemium.1722997723.txt.gz · Last modified: 2024/08/07 02:28 by tungnt

Donate Powered by PHP Valid HTML5 Valid CSS Driven by DokuWiki