stack :
Hi, I am scraping teacher jobs from this website: https://www.indeed.co.in/?r=us. Now I want to upload them to Excel and to a database at the same time. How is this possible — can anyone help me? I want headers (title, company, salary) along with the data. My scraping code in scrap.py is:
"""Scrape teacher job postings (title, company, salary) from Indeed and print them."""
from selenium import webdriver
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.common.by import By
from selenium.webdriver.support.ui import WebDriverWait
from selenium.webdriver.support import expected_conditions as EC
from selenium.common.exceptions import NoSuchElementException

url = 'https://www.indeed.co.in/?r=us'
driver = webdriver.Chrome(r"mypython/bin/chromedriver_linux64/chromedriver")
driver.get(url)

# Type the query into the "what" box and submit the search form.
driver.find_element_by_xpath('//*[@id="text-input-what"]').send_keys("teacher")
driver.find_element_by_xpath('//*[@id="whatWhereFormId"]/div[3]/button').click()

# Wait for the first results column; its card count drives how many
# pagination steps the outer loop performs.
first_results = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "resultsCol")))
first_cards = WebDriverWait(first_results, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "jobsearch-SerpJobCard")))

for _ in first_cards:
    # Re-locate the result cards on every iteration: clicking "next"
    # replaces the DOM, so cached elements would go stale.
    results_col = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "resultsCol")))
    cards = WebDriverWait(results_col, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "jobsearch-SerpJobCard")))
    for card in cards:
        print(card.find_element_by_class_name("title").text)
        print(card.find_element_by_class_name("company").text)
        try:
            print(card.find_element_by_class_name("salary").text)
        except NoSuchElementException:
            # Not every posting lists a salary; skip it without hiding
            # unrelated errors the way a bare `except:` would.
            pass
        print("--------")
    # Move to the next page. Look the link up from the driver (not a card)
    # and click via JS, which is robust against overlay interception.
    next_page = driver.find_elements_by_xpath("//span[@class='pn']")[-1]
    driver.execute_script("arguments[0].click();", next_page)
Prakhar Jhudele :
Hi, I have made a few modifications to your code; the version below generates a CSV with 200+ records gathered from all of the result pages.
# -*- coding: utf-8 -*-
"""
Created on Thu Mar 5 15:12:04 2020
@author: prakh
"""
import csv

import mysql.connector
import pandas as pd
from selenium import webdriver
from selenium.common.exceptions import NoSuchElementException
from selenium.webdriver.common.by import By
from selenium.webdriver.common.keys import Keys
from selenium.webdriver.support import expected_conditions as EC
from selenium.webdriver.support.ui import WebDriverWait
from sqlalchemy import create_engine
def smallest(a, y, z):
    """Return the smallest of the three values.

    Bug fix: the original pairwise-comparison version ended with
    ``if y < z: M = y``, which overwrote the correct minimum whenever
    ``y < z`` — e.g. ``smallest(1, 5, 10)`` returned 5 instead of 1.
    Delegating to the builtin ``min`` is both correct and simpler.
    """
    return min(a, y, z)
url = 'https://www.indeed.co.in/?r=us'
driver = webdriver.Chrome(executable_path='C:/Users/prakh/Documents/PythonScripts/chromedriver.exe')
driver.get(url)

# Search for "teacher" jobs.
driver.find_element_by_xpath('//*[@id="text-input-what"]').send_keys("teacher")
driver.find_element_by_xpath('//*[@id="whatWhereFormId"]/div[3]/button').click()

# Wait for the first results page; its card count drives the number of
# pagination steps taken by the outer loop.
data = WebDriverWait(driver, 10).until(
    EC.presence_of_element_located((By.ID, "resultsCol")))
result_set = WebDriverWait(data, 10).until(
    EC.presence_of_all_elements_located((By.CLASS_NAME, "jobsearch-SerpJobCard")))

titles = []
company = []
salaries = []

for _ in result_set:
    # Re-locate the cards after every pagination click — the DOM is
    # replaced, so previously-found elements would be stale.
    data = WebDriverWait(driver, 10).until(
        EC.presence_of_element_located((By.ID, "resultsCol")))
    cards = WebDriverWait(data, 10).until(
        EC.presence_of_all_elements_located((By.CLASS_NAME, "jobsearch-SerpJobCard")))
    for card in cards:
        titles.append(card.find_element_by_class_name("title").text)
        company.append(card.find_element_by_class_name("company").text)
        try:
            salaries.append(card.find_element_by_class_name("salary").text)
        except NoSuchElementException:
            # Keep the three lists aligned even when no salary is shown;
            # catching only NoSuchElementException avoids hiding real errors.
            salaries.append('NA')
    # Move to the next page (JS click is robust against overlay interception).
    next_page = driver.find_elements_by_xpath("//span[@class='pn']")[-1]
    driver.execute_script("arguments[0].click();", next_page)

# Trim all rows to the shortest list so columns stay aligned. The builtin
# min replaces the buggy hand-rolled smallest(); range(num_rows) keeps the
# last record that the original range(0, val - 1) silently dropped.
num_rows = min(len(titles), len(company), len(salaries))
print(num_rows)

# One open/close with a context manager, in 'w' mode so re-running the
# script does not append a second header row to the file.
with open("teacherr_jobs.csv", 'w', encoding='utf-8', newline='') as result_file:
    wr = csv.writer(result_file)
    wr.writerow(["Title", "School", "Salary"])
    for i in range(num_rows):
        wr.writerow([titles[i], company[i], salaries[i]])

# Push the same (trimmed) rows to MySQL. Slicing to num_rows prevents the
# DataFrame constructor from raising on unequal list lengths.
# NOTE: replace the [user]/[pass]/[host]/[port]/[schema] placeholders with
# real credentials before running.
final_df = pd.DataFrame(
    {'Title': titles[:num_rows],
     'School': company[:num_rows],
     'Salary': salaries[:num_rows]
     })
engine = create_engine('mysql+mysqldb://[user]:[pass]@[host]:[port]/[schema]', echo=False)
final_df.to_sql(name='my_table', con=engine, if_exists='append', index=False)

driver.quit()