395 lines
13 KiB
JavaScript
395 lines
13 KiB
JavaScript
// interencheresData.js
|
|
const {ScraperTools} = require('../Scraper');
|
|
const urlModule = require('url');
|
|
const moment = require('moment-timezone');
|
|
const { Console } = require('console');
|
|
const { title } = require('process');
|
|
|
|
class InterencheresData extends ScraperTools {
|
|
|
|
_Name = 'interencheres'
|
|
|
|
/**
 * Split a sale or lot URL into the identifiers encoded in its path.
 *
 * Only the non-empty path segments are considered:
 *  - segment 0 is returned unchanged as `typeSale`;
 *  - `saleID` is the text after the last '-' of segment 1;
 *  - when a third segment exists, `lotID` is its second '-'-separated piece,
 *    cut at the first '.'; otherwise `lotID` is the number 0.
 *
 * @param {string} url - Absolute URL to analyse.
 * @returns {Promise<{typeSale: string, saleID: string, lotID: (string|number)}>}
 * @throws {TypeError} If `url` cannot be parsed, has fewer than two path
 *   segments, or has a third segment without a '-'.
 */
getUrlInfo = async (url) => {
  const { pathname } = new urlModule.URL(url);
  const [typeSale, saleSegment, lotSegment] = pathname.split('/').filter(Boolean);

  const saleTokens = saleSegment.split('-');
  const saleID = saleTokens[saleTokens.length - 1];

  // A URL without a third segment points at a sale, not a lot: keep 0 as marker.
  const lotID = lotSegment === undefined
    ? 0
    : lotSegment.split('-')[1].split('.')[0];

  return { typeSale, saleID, lotID };
};
|
|
|
|
// ## Lot
|
|
|
|
/**
 * Collect the picture URLs requested by a lot page while its gallery is
 * stepped through.
 *
 * A temporary 'response' listener records every response whose URL contains
 * the picture host, once its body has been read. The page is then opened and
 * the gallery "next" arrow is clicked (at least once, at most 20 times, and
 * no more than needed when the picture counter could be read). The listener
 * is always detached before returning so repeated calls on the same page do
 * not pile up handlers.
 *
 * @param {object} page - Browser page; must provide on/off/goto/waitForTimeout
 *   (presumably a Puppeteer Page — confirm against the caller).
 * @param {string} Url - Address of the lot page to open.
 * @returns {Promise<string[]>} Distinct picture URLs in first-seen order.
 */
getPictures = async (page, Url) => {
  const PICTURE_HOST = 'thumbor-indbupload.interencheres.com';
  const MAX_CLICKS = 20;
  const seenUrls = [];

  const onResponse = (response) => {
    const url = response.url();
    // Plain substring test: String#match would treat the dots as wildcards.
    if (!url.includes(PICTURE_HOST)) {
      return;
    }
    response.buffer()
      .then(() => {
        console.log("push "+url)
        seenUrls.push(url);
      })
      .catch((error) => {
        // The body may be gone (navigation, aborted request): skip that picture.
        console.error('getPictures: could not read '+url, error);
      });
  };

  page.on('response', onResponse);
  try {
    console.log('go to : '+Url)

    await page.goto(Url);

    const picturesNumberXPath = [
      "//div[contains(@class, 'pswp__counter')]"
    ];
    const picturesNumberString = await this.getTextContent(picturesNumberXPath, page, 'picturesNumberXPath');

    // 100 means "unknown": the click loop is then bounded by MAX_CLICKS only.
    let picturesNumber = 100;
    if (picturesNumberString != '') {
      const parsedCount = Number.parseInt(picturesNumberString.split(" / ")[1], 10);
      // Keep the "unknown" default when the counter has an unexpected format.
      if (!Number.isNaN(parsedCount)) {
        picturesNumber = parsedCount;
      }
      console.log('picturesNumber : '+picturesNumber)
    }

    const ButtonNextXPath = [
      "//button[contains(@class, 'pswp__button--arrow--right')]"
    ];
    let idx = 0;
    do {
      await this.clickLink(ButtonNextXPath, page, 'ButtonNextXPath');
      await page.waitForTimeout(300);
      idx++;
      // Inequalities (not ==) so a one-picture gallery stops after the first click.
    } while (idx + 1 < picturesNumber && idx < MAX_CLICKS);
  } finally {
    page.off('response', onResponse);
  }

  // A Set keeps insertion order, so duplicates collapse onto their first occurrence.
  return [...new Set(seenUrls)];
};
|
|
|
|
/**
 * Read the lot label from the lot page and strip its leading 'Lot ' marker.
 *
 * The candidate XPaths are handed to `this.getTextContent` (inherited helper;
 * presumably tried in order — confirm in ScraperTools).
 *
 * @param {object} page - Browser page showing a lot.
 * @returns {Promise<string>} Label without the first 'Lot ' occurrence, or ''
 *   when the helper returned an empty value.
 */
getLotNumber = async (page) => {
  const candidates = [
    '/html/body/div[1]/div/div/div/main/div/div/div[2]/div/div[1]/div[2]/div[2]/div[1]',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[1]/div[1]'
  ];
  const rawLabel = await this.getTextContent(candidates, page, 'lotNumberXPath');

  if (rawLabel == '') {
    return '';
  }
  return rawLabel.replace('Lot ', '');
};
|
|
|
|
/**
 * Extract the low and high estimate from the lot page.
 *
 * The estimate text is scanned for whole numbers, written either with
 * whitespace-separated thousands groups ("1 500") or without grouping
 * ("1500"). The first number becomes the low estimate and the second, when
 * present, the high estimate.
 *
 * @param {object} page - Browser page showing a lot.
 * @returns {Promise<{EstimateLow: number, EstimateHigh: number}>} Both values
 *   are 0 when no text or no number was found; EstimateHigh is 0 when only
 *   one number was found.
 */
getEstimate = async (page) => {
  const EstimateXPath = [
    '/html/body/div[1]/div/div/div/main/div/div/div[2]/div/div[1]/div[2]/div[3]/div[2]/span',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[2]/div[2]/span'
  ];

  const EstimateString = await this.getTextContent(EstimateXPath, page, 'EstimateXPath');
  console.log('EstimateString : '+EstimateString)

  let EstimateLow = 0;
  let EstimateHigh = 0;

  if (EstimateString != '') {
    // Grouped form first ("12 500"), otherwise any plain run of digits. The
    // plain alternative stops "1500" from being read as "150" followed by "0".
    const matches = EstimateString.match(/\d{1,3}(?:\s\d{3})+|\d+/g);

    if (matches === null) {
      // With the g flag, match() yields null (never an empty array) on no match.
      console.log('Could not extract numbers.');
    } else {
      const toInteger = (text) => Number.parseInt(text.replace(/\s/g, ''), 10);

      EstimateLow = toInteger(matches[0]);
      if (matches.length >= 2) {
        EstimateHigh = toInteger(matches[1]);

        console.log('Low:', EstimateLow);
        console.log('High:', EstimateHigh);
      }
    }
  }

  return { EstimateLow, EstimateHigh };
};
|
|
|
|
/**
 * Fetch the lot description text.
 *
 * A class-based XPath is listed first, followed by absolute fallbacks; the
 * whole list is delegated to `this.getTextContent` (inherited helper;
 * presumably tried in order — confirm in ScraperTools).
 *
 * @param {object} page - Browser page showing a lot.
 * @returns {Promise<*>} Whatever `getTextContent` resolves to, unmodified.
 */
getDescription = async (page) => {
  const candidates = [
    '//div[contains(@class, "description")]',
    '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[4]/div',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[4]/div',
    '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[3]/div'
  ];

  return await this.getTextContent(candidates, page, 'DescriptionXPath');
};
|
|
|
|
/**
 * Open the "Frais de vente" panel and read the sale fees.
 *
 * The fees link is clicked through `this.clickLink`; when that call reports
 * success, the fee text is read, its whitespace is normalised, and the first
 * number in it is extracted. Both '.' and ',' are accepted as decimal
 * separator (French pages write "14,28 %"); the result always uses '.'.
 *
 * @param {object} page - Browser page showing a lot.
 * @returns {Promise<{feesText: string, fees: (string|number)}>} `feesText` is
 *   the normalised text ('' when unavailable). `fees` is the extracted number
 *   as a string (e.g. '14.28'), or the number 0 when nothing was found — this
 *   mixed typing is kept for compatibility with existing callers.
 */
getFees = async (page) => {
  let feesText = '';
  let fees = 0;

  const ButtonFeesXPath = [
    './/a[contains(text(),"Frais de vente")]',
    '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[3]/div[2]/a',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div[2]/div/div[1]/div[2]/div[3]/div[2]/a',
    '/html/body/div[1]/div/div/div[2]/div/div[1]/div[2]/div/div[1]/div[2]/div[2]/div[2]/a'
  ];

  if (await this.clickLink(ButtonFeesXPath, page, 'ButtonFeesXPath')) {
    const FeesXPath = [
      "//strong[contains(text(), 'Frais de vente :')]/following-sibling::span",
      '/html/body/div[1]/div/div/div[3]/div/div/div[2]/div/p[1]/p[1]/span',
    ];
    feesText = await this.getTextContent(FeesXPath, page, 'FeesXPath');

    if (feesText != '') {
      feesText = feesText.replace(/[\n]/g, '').replace(/\s+/g, ' ').trim();

      // Accept a decimal comma as well as a dot, then normalise to a dot.
      const matches = feesText.match(/\d+(?:[.,]\d+)?/);
      if (matches) {
        fees = matches[0].replace(',', '.');
      }
    }
  }

  return { feesText, fees };
};
|
|
|
|
/**
 * Return the lot identifier contained in a URL.
 *
 * Thin wrapper around `getUrlInfo` that logs and returns its `lotID` field.
 *
 * @param {string} url - Lot (or sale) URL.
 * @returns {Promise<(string|number)>} The `lotID` reported by `getUrlInfo`.
 */
getLotID = async (url) => {
  const { lotID } = await this.getUrlInfo(url);
  console.log('id_lot : '+lotID)
  return lotID;
};
|
|
|
|
/**
 * Derive the sale identifier and the bare sale URL from a sale or lot URL.
 *
 * The identifier is the `saleID` reported by `getUrlInfo`. The sale URL is
 * the input truncated at the first '/lot-' and then at the first '?', so
 * neither the lot part nor the query string remains.
 *
 * @param {string} url - Sale or lot URL.
 * @returns {Promise<{id_sale: string, urlSale: string}>}
 */
getSaleIdUrl = async (url) => {
  const { saleID: id_sale } = await this.getUrlInfo(url);

  // split() returns the whole string as first piece when the separator is
  // absent, so both cuts are safe to apply unconditionally.
  const [withoutLot] = url.split("/lot-");
  const [urlSale] = withoutLot.split("?");

  console.log('getSaleIdUrl urlSale : '+urlSale)
  return { id_sale, urlSale };
};
|
|
|
|
// ## Lot List
|
|
/**
 * Turn the lot elements of a sale page into plain lot descriptors.
 *
 * For every element the title text and the link `href` are read through the
 * inherited element helpers; the platform id is the text between 'lot-' and
 * '.html' in the link. Extraction is best-effort: when anything throws for an
 * element, the error is logged and an empty object is stored in its place, so
 * the result always has one entry per input element.
 *
 * @param {object} page - Browser page showing the sale.
 * @param {Iterable<object>} Elements - Element handles, one per lot.
 * @returns {Promise<object[]>} Entries shaped as
 *   `{title, idPlatform, platform, lotNumber}`, or `{}` for failed elements.
 */
_getLotInfoList = async (page, Elements) => {
  const collected = [];

  for (const node of Elements) {
    let entry = {};

    try {
      const titleXPaths = [
        './/a/div/div/div[5]/div/div[1]',
        './/a/div/div/div[4]/div/div[1]',
      ];
      const label = await this.getTextContentElement(titleXPaths, page, node, 'LotnameXPath');

      // The platform id is carried by the lot link itself.
      const linkXPaths = [
        './/a'
      ];
      const href = await this.getAttributeElement(linkXPaths, page, node, 'href', 'UrlListLot');
      const found = href.match(/lot-(.*).html/);
      const idPlatform = found[1];

      entry = {
        title: label,
        idPlatform: idPlatform,
        platform: this._Name,
        lotNumber: label.split('Lot ')[1]
      };
    } catch (e) {
      console.error(e)
    }

    collected.push(entry);
  }

  return collected;
};
|
|
|
|
/**
 * Walk every page of a sale and gather the descriptors of all listed lots.
 *
 * On each page the lot wrappers are converted with `_getLotInfoList`; then an
 * enabled "Page suivante" button is looked up. If one exists it is clicked,
 * the method waits one second and repeats; otherwise the walk ends.
 *
 * @param {object} page - Browser page showing the first page of the sale.
 * @returns {Promise<object[]>} Lot descriptors of all visited pages, in order.
 */
getLotList = async (page) => {
  const lotItemXPath = '//div[contains(@class, "sale-item-wrapper")]';
  // Only matches the pagination button while it is not disabled.
  const nextButtonXPath = "//button[contains(@aria-label, 'Page suivante') and not(contains(@class, 'v-pagination__navigation--disabled'))]";

  let allLots = [];

  for (;;) {
    const items = await page.$x(lotItemXPath);
    if (items.length > 0) {
      const pageLots = await this._getLotInfoList(page, items);
      allLots = allLots.concat(pageLots);
    }

    const nextButtons = await page.$x(nextButtonXPath);
    if (nextButtons.length === 0) {
      break;
    }

    await nextButtons[0].evaluate(b => b.click());
    await page.waitForTimeout(1000);
    console.log('Next Page')
  }

  return allLots;
};
|
|
|
|
|
|
// ## Sale
|
|
|
|
/**
 * Fetch the sale title.
 *
 * Several absolute XPaths covering known page layouts are delegated to
 * `this.getTextContent` (inherited helper; presumably tried in order —
 * confirm in ScraperTools).
 *
 * @param {object} page - Browser page showing a sale.
 * @returns {Promise<*>} Whatever `getTextContent` resolves to, unmodified.
 */
getSaleTitle = async (page) => {
  const candidates = [
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h1/div/div/div/div/div',
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/h1/div/div/div/div/div',
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/div/h1/div/div/div/div/div',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div/div/div/div[1]/div/div/div/div/h1/div/div/div/div/div'
  ];

  return await this.getTextContent(candidates, page, 'SaleTitleXPath');
};
|
|
|
|
/**
 * Determine the date of a sale as a formatted Europe/Paris timestamp.
 *
 * When the page contains the live-streaming element, the sale is treated as
 * running now and the current Paris time is returned. Otherwise the date
 * label is read and parsed; it is expected to consist of space-separated
 * tokens `<day> <french month> <year> <word> <HH>h<MM>` (only tokens 0, 1, 2
 * and 4 are used). Missing minutes ("14h") are read as 0.
 *
 * @param {object} page - Browser page showing a sale.
 * @returns {Promise<string>} Result of moment's `format()`; 'Invalid date'
 *   when the day, month, year or hour cannot be interpreted.
 * @throws {TypeError} When the label has no time token (fewer than five
 *   space-separated parts).
 */
getSaleDate = async (page) => {
  // Detect a live sale; a failing detection is treated as "not live".
  let BoolLive = false;
  try {
    const VideoXPath = [
      '//*[@id="streaming-subscriber"]',
    ];
    const VideoExists = await this.ElementExists(VideoXPath, page, 'VideoXPath');
    console.log('VideoExists : '+VideoExists)
    BoolLive = VideoExists;
  } catch (error) {
    // Best effort: keep going as a scheduled sale, but leave a trace.
    console.error('getSaleDate: live detection failed, assuming a scheduled sale', error);
  }

  let SaleDate;

  if (BoolLive) {
    // Live sale: it is happening right now.
    SaleDate = moment.tz('Europe/Paris').format();
  } else {
    // Scheduled sale: parse the date shown on the page.
    await page.waitForTimeout(400);
    const SaleDateXPath = [
      '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h2/div[2]/div/div[1]/span',
    ];
    let SaleDateString = await this.getTextContent(SaleDateXPath, page, 'SaleDateXPath');
    SaleDateString = SaleDateString.trim();
    console.log('SaleDateString : '+SaleDateString)

    const months = {
      'janvier': '01',
      'février': '02',
      'mars': '03',
      'avril': '04',
      'mai': '05',
      'juin': '06',
      'juillet': '07',
      'août': '08',
      'septembre': '09',
      'octobre': '10',
      'novembre': '11',
      'décembre': '12'
    };

    const [dayToken, monthToken, yearToken, , timeToken] = SaleDateString.split(' ');
    if (timeToken === undefined) {
      throw new TypeError('getSaleDate: unexpected sale date format: "'+SaleDateString+'"');
    }

    const day = Number.parseInt(dayToken, 10);
    // Month names are matched case-insensitively; unknown names yield NaN.
    const month = Number.parseInt(months[monthToken.toLowerCase()], 10);
    const year = Number.parseInt(yearToken, 10);

    const [hourToken, minuteToken] = timeToken.split('h');
    const hour = Number.parseInt(hourToken, 10);
    // "14h" carries no minutes: read it as 14h00 instead of producing NaN.
    const minute = minuteToken ? Number.parseInt(minuteToken, 10) : 0;

    // moment expects a zero-based month index.
    SaleDate = moment.tz([year, month - 1, day, hour, minute], 'Europe/Paris').format();
  }

  console.log('SaleDate : '+SaleDate)
  return SaleDate;
};
|
|
|
|
/**
 * Fetch the sale location and strip surrounding whitespace.
 *
 * The candidate XPaths are delegated to `this.getTextContent` (inherited
 * helper; presumably tried in order — confirm in ScraperTools).
 *
 * @param {object} page - Browser page showing a sale.
 * @returns {Promise<string>} Trimmed location text.
 */
getSaleLocation = async (page) => {
  const candidates = [
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div[1]/div[2]/h2/div[2]/div/div[2]/span',
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[1]/div/div/div/h2/div[2]'
  ];

  const rawLocation = await this.getTextContent(candidates, page, 'SaleLocationXPath');
  return rawLocation.trim();
};
|
|
|
|
/**
 * Fetch the auction house name as a single clean line.
 *
 * Line feeds are removed, every remaining run of whitespace is collapsed to
 * one space and the ends are trimmed.
 *
 * @param {object} page - Browser page showing a sale.
 * @returns {Promise<string>} Normalised auction house name.
 */
getSaleHouseName = async (page) => {
  const candidates = [
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/div/div[2]/div[2]/div[2]/a',
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[3]/div/div/div/div/div[2]/div/div/div[2]/div[2]/div[2]/a',
    '/html/body/div[1]/div/div/div/main/div/div/div/div/div/div[2]/div[2]/div/div/div/div/div[2]/div/div/div/div[2]/div[2]/a',
    '/html/body/div[1]/div/div/div[1]/main/div/div/div/div/div/div[2]/div[3]/div/div/div/div/div[2]/div/div/div/div[2]/div[2]/a'
  ];

  const rawName = await this.getTextContent(candidates, page, 'SaleHouseNameXPath');

  // Drop line feeds first (without inserting a space), then squeeze whitespace.
  const withoutLineFeeds = rawName.replace(/[\n]/g, '');
  const singleSpaced = withoutLineFeeds.replace(/\s+/g, ' ');
  return singleSpaced.trim();
};
|
|
|
|
}
|
|
|
|
module.exports = InterencheresData |