# Backend: add Swagger annotations to all job handlers (GET, POST, PUT, DELETE);
# clean up job handler code.
# Frontend: expand api.ts with ApiJob types, pagination, and a transform function;
# update footer with "Vagas por Tecnologia" SEO links; add robots.txt with
# crawler directives; add sitemap.xml with main pages and job URLs;
# change branding to GoHorse Jobs.
"""
|
|
Scraper para GeekHunter - https://www.geekhunter.com.br/vagas
|
|
"""
|
|
|
|
import requests
|
|
from bs4 import BeautifulSoup
|
|
import pandas as pd
|
|
import time
|
|
|
|
# Headers para simular navegador e evitar bloqueios
|
|
HEADERS = {
|
|
'User-Agent': 'Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/120.0.0.0 Safari/537.36',
|
|
'Accept': 'text/html,application/xhtml+xml,application/xml;q=0.9,image/webp,*/*;q=0.8',
|
|
'Accept-Language': 'pt-BR,pt;q=0.9,en-US;q=0.8,en;q=0.7',
|
|
}
|
|
|
|
def scrape_geekhunter(delay: float = 2.0) -> pd.DataFrame:
|
|
"""
|
|
Raspa vagas do site GeekHunter.
|
|
|
|
Args:
|
|
delay: Tempo de espera antes da requisição (anti-bloqueio)
|
|
|
|
Returns:
|
|
DataFrame com colunas: titulo, empresa, localizacao, link
|
|
"""
|
|
url = "https://www.geekhunter.com.br/vagas"
|
|
vagas = []
|
|
|
|
try:
|
|
# Delay anti-bloqueio
|
|
time.sleep(delay)
|
|
|
|
print(f"🔍 Raspando vagas de: {url}")
|
|
response = requests.get(url, headers=HEADERS, timeout=30)
|
|
response.raise_for_status()
|
|
|
|
soup = BeautifulSoup(response.text, 'html.parser')
|
|
|
|
# Encontrar cards de vagas (ajustar seletores conforme estrutura do site)
|
|
job_cards = soup.select('.job-card') or soup.select('[class*="job"]') or soup.select('article')
|
|
|
|
for card in job_cards:
|
|
try:
|
|
# Extrair título
|
|
titulo_elem = card.select_one('h2') or card.select_one('h3') or card.select_one('.title')
|
|
titulo = titulo_elem.get_text(strip=True) if titulo_elem else "N/A"
|
|
|
|
# Extrair empresa
|
|
empresa_elem = card.select_one('.company') or card.select_one('[class*="company"]')
|
|
empresa = empresa_elem.get_text(strip=True) if empresa_elem else "N/A"
|
|
|
|
# Extrair localização
|
|
loc_elem = card.select_one('.location') or card.select_one('[class*="location"]')
|
|
localizacao = loc_elem.get_text(strip=True) if loc_elem else "Remoto"
|
|
|
|
# Extrair link
|
|
link_elem = card.select_one('a[href*="/vagas/"]') or card.select_one('a')
|
|
if link_elem:
|
|
href = link_elem.get('href', '')
|
|
link = f"https://www.geekhunter.com.br{href}" if href.startswith('/') else href
|
|
else:
|
|
link = url
|
|
|
|
vagas.append({
|
|
'titulo': titulo,
|
|
'empresa': empresa,
|
|
'localizacao': localizacao,
|
|
'link': link,
|
|
'fonte': 'GeekHunter'
|
|
})
|
|
except Exception as e:
|
|
print(f"⚠️ Erro ao processar card: {e}")
|
|
continue
|
|
|
|
print(f"✅ {len(vagas)} vagas encontradas no GeekHunter")
|
|
|
|
except requests.exceptions.RequestException as e:
|
|
print(f"❌ Erro na requisição ao GeekHunter: {e}")
|
|
except Exception as e:
|
|
print(f"❌ Erro inesperado no GeekHunter: {e}")
|
|
|
|
return pd.DataFrame(vagas)
|
|
|
|
|
|
if __name__ == "__main__":
|
|
# Teste individual do scraper
|
|
df = scrape_geekhunter()
|
|
print(df.head())
|